diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml index 75aeab64b7f..9f35f07d885 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs.yaml @@ -82,6 +82,8 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -90,26 +92,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -117,40 +127,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 59136 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 26112 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 59136 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: -1 - LocalReadVectorWidth: 4 - LocalSplitU: 1 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -165,16 +179,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -182,27 +197,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 16 - NumLoadsB: 12 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -210,65 +231,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -277,25 +317,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -304,45 +352,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 LdsInitCVgprs: false - LdsNumBytes: 50176 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 101376 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -352,16 +404,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 4] + MIWaveTileA: 7 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 224 + MacroTile1: 128 + MacroTileA: 224 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -369,27 +422,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 7 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 64 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -397,65 +456,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 28 + ThreadTile1: 4 + ThreadTileA: 28 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -464,26 +542,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -491,45 +577,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x112x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 50176 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 4 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -540,15 +630,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 112 + MacroTileA: 128 + MacroTileB: 112 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -556,27 +647,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 7 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 64 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -584,65 +681,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x112x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 16 SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 8 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -651,26 +767,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -678,40 +802,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x208x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 47616 LdsInitCVgprs: false - LdsNumBytes: 61568 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 28288 + LdsNumBytes: 47616 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 30720 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61568 - LdsOffsetMetadata_Blk: 98816 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: -1 - LocalReadVectorWidth: 4 + LdsOffsetMetadata: 47616 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -726,16 +854,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 13] - MIWaveTileA: 4 - MIWaveTileB: 13 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 208 - MacroTileA: 256 - MacroTileB: 208 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -743,27 +872,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 208 - NumGlobalWriteVectorsPerThread: 52 - NumLoadsA: 16 - NumLoadsB: 13 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 13 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -771,93 +906,120 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x208x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 13 - ThreadTileA: 16 - ThreadTileB: 13 + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 8 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -865,21 +1027,24 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 LdsInitCVgprs: false LdsNumBytes: 45056 LdsNumElementsAlignedA: 17408 @@ -896,14 +1061,15 @@ LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -914,15 +1080,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -930,20 +1097,25 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 NumLoadsA: 4 NumLoadsB: 6 NumLoadsCoalescedA: 1 @@ -951,6 +1123,7 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -958,65 +1131,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 1024 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -1025,26 +1217,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1052,40 +1252,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 + LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 20992 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 2560 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20992 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -1100,16 +1304,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -1117,27 +1322,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -1145,93 +1356,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6n0vym1x7bi1E9oc9evhO3nLfooFqnChTUcIy8TkK-Rw= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 6 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1239,45 +1478,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49152 LdsInitCVgprs: false - LdsNumBytes: 50176 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -1287,16 +1530,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -1304,27 +1548,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -1332,93 +1582,120 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU6_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 6] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3Ruj2k0CBqbLZ8JvMlKqO9d_oQPw5DN9SC6dTYFm9Z2k= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -1427,40 +1704,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 20992 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 2560 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20992 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -1475,16 +1756,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 1] + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -1492,27 +1774,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -1520,36 +1808,45 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 1 - StaggerUStride: 256 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - SubGroup0: 16 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 1 + ThreadTile1: 3 ThreadTileA: 8 - ThreadTileB: 1 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -1559,26 +1856,36 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -1587,25 +1894,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -1614,40 +1929,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35328 LdsInitCVgprs: false - LdsNumBytes: 59136 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 26112 + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 59136 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: -1 - LocalReadVectorWidth: 4 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -1662,16 +1981,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] + MIWaveGroup: [1, 4] + MIWaveTile: [8, 2] MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -1679,27 +1999,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 16 - NumLoadsB: 12 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -1707,93 +2033,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 8 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 6 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 6 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2WlTkTINN-uQ4kK8MCPJn8gb1JAv-YiXW3W6_kM0Fr7Y= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 6 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1801,45 +2155,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 50176 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -1849,16 +2207,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -1866,27 +2225,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 64 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -1894,66 +2259,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU6_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 8 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 6] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -1962,26 +2345,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1989,45 +2380,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 48640 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 48640 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -2037,16 +2432,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 48 + MacroTile1: 256 MacroTileA: 128 - MacroTileB: 48 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -2054,27 +2450,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 3 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -2082,65 +2484,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -2149,26 +2570,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -2176,45 +2605,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 58880 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -2224,16 +2657,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -2241,27 +2675,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 6 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -2269,92 +2709,120 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 8 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1S0LMynLnaTdUVBnZEr08bLnE2Pmys_Pb8d-Px-pQ-ic= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -2363,10 +2831,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -2378,10 +2848,11 @@ LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 27648 + LdsNumBytes: 32768 LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 10240 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 @@ -2390,13 +2861,14 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 + LdsOffsetMetadata: 32768 LdsOffsetMetadata_Blk: 50176 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -2412,15 +2884,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] + MIWaveTile: [4, 3] MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -2428,27 +2901,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -2456,36 +2935,45 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 16 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -2495,54 +2983,73 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 3 - - 1LDSBuffer: 0 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3GAp9JiY5h4DTv8jKeS7AR5SWqDDY7JcuV16QR77IM4Q= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -2550,45 +3057,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 128 - LSCB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 LSPA: 8 LSPB: 8 - LVCA: 16 - LVCB: 16 + LVCA: 32 + LVCB: 32 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 25600 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -2598,16 +3109,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -2615,27 +3127,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -2643,65 +3161,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -2710,26 +3247,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -2737,10 +3282,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -2749,33 +3296,35 @@ LVCB: 16 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26624 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -2785,16 +3334,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -2802,27 +3352,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -2830,93 +3386,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 1024 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 8 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3J0gg5kqXZF0KfELk9v_HY8do53h4tRIr53if5_CEn3c= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 11 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -2924,40 +3508,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 27648 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -2972,16 +3560,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -2989,27 +3578,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -3017,65 +3612,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU11_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 11] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -3084,26 +3698,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3111,45 +3733,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 26624 + LdsNumBytes: 34816 LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 65536 LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26624 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -3160,15 +3786,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -3176,27 +3803,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -3204,93 +3837,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 1 - StaggerUStride: 1024 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT68Uoqhgtp2fYhMimXjFGsonsfFHxf6iyalJrhfq4fWjk= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3298,10 +3959,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -3310,33 +3973,35 @@ LVCB: 16 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 35840 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35840 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -3346,16 +4011,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -3363,20 +4029,25 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -3384,6 +4055,7 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -3391,65 +4063,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -3458,26 +4149,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3485,10 +4184,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -3497,28 +4198,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 19456 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 19456 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -3534,15 +4237,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -3550,27 +4254,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -3578,65 +4288,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 8 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -3645,26 +4374,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3672,40 +4409,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 33280 - LdsNumElementsAlignedA: 12800 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 12800 - LdsOffsetB_Blk: 78336 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 78336 + LdsOffsetMetadata: 59392 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -3720,16 +4461,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 2] - MIWaveTileA: 5 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 80 - MacroTile1: 128 - MacroTileA: 80 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -3737,27 +4479,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 5 - NumLoadsB: 4 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -3765,93 +4513,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 1 - StaggerUStride: 256 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 2 - ThreadTileA: 20 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 8 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6IdUH8OtJVmXYKaFIMvWv898-ZHBTc6DK_CCX4uYNti4= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3859,45 +4635,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false LSCA: 128 LSCB: 128 - LSPA: 8 - LSPB: 8 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 25600 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -3907,16 +4687,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -3924,27 +4705,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -3952,65 +4739,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 20 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -4019,25 +4825,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 9 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -4046,40 +4860,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 18944 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18944 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -4094,16 +4912,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] + MIWaveGroup: [1, 2] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -4111,27 +4930,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -4139,66 +4964,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU9_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 1 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 9] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -4207,25 +5050,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 9 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -4234,10 +5085,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -4247,27 +5100,29 @@ LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 18944 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18944 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -4282,16 +5137,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -4299,27 +5155,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -4327,65 +5189,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU9_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 1 - StaggerUStride: 512 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 1 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 9] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -4394,25 +5275,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 9 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -4421,40 +5310,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 18944 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18944 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -4469,16 +5362,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] + MIWaveGroup: [2, 1] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -4486,27 +5380,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -4514,94 +5414,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU9_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 1 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 9] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT17WqcefPXdcrzUDZYyTntAaG31x5oE5tdtfZJxIy6cOE= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -4609,40 +5536,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25600 LdsInitCVgprs: false - LdsNumBytes: 30720 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 5120 + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 25600 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 41472 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -4657,16 +5588,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -4674,27 +5606,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -4702,93 +5640,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6uVEKvoux042eSGfh2ID09wQ2hf8OUYVXnLA-IcXs7o4= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -4796,10 +5762,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -4808,28 +5776,30 @@ LVCB: 16 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63488 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 46080 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -4845,15 +5815,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -4861,27 +5832,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -4889,65 +5866,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x160x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 1024 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -4956,26 +5952,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -4983,25 +5987,28 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 51200 + LdsNumBytes: 65536 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -5010,18 +6017,19 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 + LdsOffsetMetadata: 59904 LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -5031,16 +6039,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -5048,27 +6057,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -5076,65 +6091,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -5143,26 +6177,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -5170,40 +6212,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60928 LdsInitCVgprs: false - LdsNumBytes: 53760 - LdsNumElementsAlignedA: 12800 - LdsNumElementsAlignedB: 40960 + LdsNumBytes: 60928 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 12800 - LdsOffsetB_Blk: 78336 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 53760 - LdsOffsetMetadata_Blk: 78336 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 91648 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -5219,15 +6265,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [5, 4] - MIWaveTileA: 5 + MIWaveTile: [12, 4] + MIWaveTileA: 12 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 80 + MacroTile0: 192 MacroTile1: 256 - MacroTileA: 80 + MacroTileA: 192 MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -5235,27 +6282,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularA: 6 NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -5263,93 +6316,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 + ThreadTile0: 48 ThreadTile1: 4 - ThreadTileA: 20 + ThreadTileA: 48 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1raku-cxrZMWbO0CC2AjI-4ie8QGCknrirKHbhwgtns4= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -5357,40 +6438,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 33792 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 83968 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 83968 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -5405,16 +6490,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 6] - MIWaveTileA: 2 - MIWaveTileB: 6 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 96 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -5422,27 +6508,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -5450,65 +6542,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 16 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -5517,26 +6628,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -5544,10 +6663,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x112x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -5556,33 +6677,35 @@ LVCB: 16 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 53760 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 36864 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 53760 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 4 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -5592,16 +6715,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 112 + MacroTileA: 128 + MacroTileB: 112 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -5609,27 +6733,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 7 + NumLoadsA: 8 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -5637,65 +6767,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 29 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x112x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true - UnrollMajorLDSA: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 8 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -5704,26 +6853,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -5731,40 +6888,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: -1 - LocalReadVectorWidth: 4 - LocalSplitU: 1 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -5779,16 +6940,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -5796,27 +6958,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 14 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -5824,93 +6992,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 30 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 8 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1BO15GwoaHoLQS-MW80HA812QeAU4BkTYnjNreTV3hbA= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -5918,10 +7114,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -5930,28 +7128,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60928 LdsInitCVgprs: false - LdsNumBytes: 44032 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 60928 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 44032 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 91648 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -5966,16 +7166,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 64 - MacroTileA: 256 - MacroTileB: 64 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -5983,27 +7184,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -6011,65 +7218,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 31 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -6078,25 +7304,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -6105,40 +7339,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 + LVCB: 8 LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 20992 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 2560 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20992 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -6153,16 +7391,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -6170,27 +7409,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -6198,65 +7443,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 32 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -6265,26 +7529,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6292,40 +7564,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT14_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 59136 - LdsNumElementsAlignedB: 4352 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 59136 - LdsOffsetB_Blk: 124672 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 124672 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: -1 - LocalReadVectorWidth: 4 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -6341,15 +7617,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [14, 1] - MIWaveTileA: 14 - MIWaveTileB: 1 + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 448 - MacroTile1: 32 - MacroTileA: 448 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -6357,27 +7634,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 56 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 28 - NumLoadsB: 2 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 28 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -6385,93 +7668,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 33 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT448x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT14_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 56 - ThreadTile1: 1 - ThreadTileA: 56 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6YfHPpW5nrJ_KoDtfgCdPidYL7FGc1oKcZLotw2dMqNA= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 6 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6479,45 +7790,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 LdsInitCVgprs: false - LdsNumBytes: 50176 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -6527,16 +7842,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -6544,27 +7860,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -6572,66 +7894,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 34 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU6_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 6] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -6640,26 +7980,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6667,45 +8015,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 58880 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 100352 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -6715,16 +8067,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -6732,27 +8085,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -6760,65 +8119,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 35 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -6827,25 +8205,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -6854,45 +8240,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 37888 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 37888 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -6903,15 +8293,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] + MIWaveTile: [4, 3] MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 96 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -6919,27 +8310,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -6947,36 +8344,45 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 36 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 4 + ThreadTile1: 3 ThreadTileA: 16 - ThreadTileB: 4 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -6986,26 +8392,36 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -7014,26 +8430,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7041,45 +8465,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x112x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 19456 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 19456 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -7089,16 +8517,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 112 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 112 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -7106,27 +8535,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 7 + NumLoadsA: 4 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -7134,92 +8569,120 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 37 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x112x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 1024 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 3 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true - AssertFree0ElementMultiple: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3oR9yCzFLBkB-A3rS4fIh8cL6BZiueTHUF10AEMc6Cao= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 9 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -7228,45 +8691,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 28672 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 28672 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 16 + LoopUnroll: 256 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -7276,16 +8743,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -7293,27 +8761,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -7321,36 +8795,45 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 38 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU9_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 1 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -7360,26 +8843,36 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 9] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -7388,25 +8881,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -7415,45 +8916,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60928 LdsInitCVgprs: false - LdsNumBytes: 19456 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 60928 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 19456 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 91648 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -7463,16 +8968,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -7480,27 +8986,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -7508,65 +9020,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 39 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 1 - StaggerUStride: 1024 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 24 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 24 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 3 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -7575,26 +9106,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7602,45 +9141,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 53248 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 53248 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 91136 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -7650,16 +9193,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 5] + MIWaveTileA: 5 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 160 + MacroTile1: 160 + MacroTileA: 160 + MacroTileB: 160 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -7667,27 +9211,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 100 + NumGlobalWriteVectorsPerThread: 100 + NumLoadsA: 5 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -7695,93 +9245,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 40 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 20 + ThreadTile1: 5 + ThreadTileA: 20 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT30pNHTzkaDtl1V6kOLGqErWay80kGtYggMafN3flPYWk= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7789,45 +9367,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 27136 LdsInitCVgprs: false LdsNumBytes: 27136 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 10240 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 27136 - LdsOffsetMetadata_Blk: 49664 + LdsOffsetMetadata_Blk: 41472 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -7838,15 +9420,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [8, 1] - MIWaveTileA: 8 + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -7854,27 +9437,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -7882,65 +9471,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 41 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 32 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -7949,26 +9557,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7976,10 +9592,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -7988,28 +9606,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 LdsInitCVgprs: false - LdsNumBytes: 46080 - LdsNumElementsAlignedA: 25600 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 13824 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25600 - LdsOffsetB_Blk: 91136 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 46080 - LdsOffsetMetadata_Blk: 91136 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -8024,16 +9644,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 4] - MIWaveTileA: 5 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -8041,27 +9662,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 - NumLoadsB: 4 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -8069,93 +9696,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 42 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 4 - ThreadTileA: 20 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3Zgh4uZH3XwYCLwmVcVtjIZnx6vFX4GmUTCIqzluGGTY= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8163,40 +9818,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 25600 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -8211,9 +9870,9 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 @@ -8221,6 +9880,7 @@ MacroTileA: 32 MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -8228,27 +9888,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -8256,93 +9922,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 43 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT9g1aeT0kLmhZ_sWyxI4GeK3XcMOEdaqmXjhARoSfPEek= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 7 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8350,45 +10044,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30720 LdsInitCVgprs: false - LdsNumBytes: 48640 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 48640 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 30720 + LdsOffsetMetadata_Blk: 48128 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -8398,16 +10096,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 3] - MIWaveTileA: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 48 - MacroTileA: 128 - MacroTileB: 48 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -8415,27 +10114,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 3 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -8443,93 +10148,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 44 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU7_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 1 - StaggerUStride: 256 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 12 ThreadTile1: 3 - ThreadTileA: 8 + ThreadTileA: 12 ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 7] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3q5MPFCjk8cIJxeg__BJLgMDfRuxj9turlgrS4jDIH-8= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 7 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8537,10 +10270,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -8549,28 +10284,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 37888 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 37888 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -8585,16 +10322,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -8602,27 +10340,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -8630,93 +10374,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 45 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU7_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 7] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6mXuspiI9d9-5RQhZJXUheCs3sjktrc-CvyvsUkMj0Ko= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 9 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8724,10 +10496,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -8736,28 +10510,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 27136 LdsInitCVgprs: false - LdsNumBytes: 28672 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 27136 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 28672 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 27136 + LdsOffsetMetadata_Blk: 41472 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -8772,44 +10548,51 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 MatrixInstK: 16 MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -8817,65 +10600,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 46 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU9_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 9] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -8884,26 +10686,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8911,10 +10721,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -8923,28 +10735,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 19456 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 19456 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -8960,15 +10774,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -8976,27 +10791,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -9004,92 +10825,120 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 47 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 8 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6aauTeStg0aSmvo_QbeW1eL-kGfWnBPAok3n73qVy0mQ= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -9098,45 +10947,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -9156,6 +11009,7 @@ MacroTileA: 64 MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -9163,27 +11017,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -9191,25 +11051,30 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 48 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 ThreadTile1: 1 @@ -9217,10 +11082,14 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -9230,54 +11099,73 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64VVECYIqn6ZzoQ9SPCWtOtxZ3-nd7tozgJn-JGHW9Tc= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -9285,10 +11173,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -9297,28 +11187,30 @@ LVCB: 16 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35328 LdsInitCVgprs: false - LdsNumBytes: 35840 - LdsNumElementsAlignedA: 17408 + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35840 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -9333,16 +11225,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -9350,20 +11243,25 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -9371,6 +11269,7 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -9378,93 +11277,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 49 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT9HGTExvgGLNUUrb40FVK3RqyK4GC4m3UmQpATDuBwoBY= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -9472,10 +11399,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -9484,28 +11413,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 LdsInitCVgprs: false - LdsNumBytes: 55296 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 46592 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 55296 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 46592 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -9520,16 +11451,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 + MacroTile0: 96 MacroTile1: 128 - MacroTileA: 256 + MacroTileA: 96 MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -9537,27 +11469,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -9565,93 +11503,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 50 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6Qmx9VdTVz_Y5ax1LYPo-0dVO4zW6cL4X2x3fWybxfhk= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -9659,10 +11625,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -9671,28 +11639,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24576 LdsInitCVgprs: false - LdsNumBytes: 27648 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -9708,15 +11678,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -9724,27 +11695,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -9752,65 +11729,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 51 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -9819,26 +11815,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 7 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -9846,10 +11850,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -9858,28 +11864,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 LdsInitCVgprs: false - LdsNumBytes: 33792 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 35840 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 83968 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 83968 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -9894,16 +11902,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 6] - MIWaveTileA: 2 - MIWaveTileB: 6 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 96 + MacroTile1: 224 MacroTileA: 128 - MacroTileB: 96 + MacroTileB: 224 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -9911,27 +11920,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 NumLoadsA: 4 - NumLoadsB: 3 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -9939,93 +11954,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 52 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU7_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 7] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6VKOvwZI9OdyDqa8R6ZplYgmun7qlPSAW5bdf-sEu6Yw= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -10033,40 +12076,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 31232 LdsInitCVgprs: false - LdsNumBytes: 37376 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 31232 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 13824 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 37376 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 31232 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -10081,16 +12128,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -10098,27 +12146,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 6 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -10126,93 +12180,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 53 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 1 - StaggerUStride: 512 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3wTwGBgkBTL7ZfQIiBTrtIQlJHRTrSp-qjCgptPT7Aa4= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -10220,45 +12302,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 18432 LdsInitCVgprs: false - LdsNumBytes: 37888 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 37888 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -10269,15 +12355,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -10285,27 +12372,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -10313,92 +12406,120 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 54 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 3 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3OQm6OVrDzPtaX_AHLYnzKIpm9pmBS-lHC57KJsq_NoU= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -10407,45 +12528,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26624 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -10455,16 +12580,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -10472,27 +12598,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -10500,65 +12632,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 55 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 8 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -10567,25 +12718,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 5 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -10594,10 +12753,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -10609,25 +12770,27 @@ LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 37888 + LdsNumBytes: 32768 LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 32768 LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 37888 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -10643,15 +12806,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] + MIWaveTile: [4, 3] MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 96 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -10659,27 +12823,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -10687,36 +12857,45 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 56 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU5_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 4 + ThreadTile1: 3 ThreadTileA: 16 - ThreadTileB: 4 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -10726,53 +12905,72 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 5] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3OkhJ3w8xamz6Pe7QJHpBed4sfIuniTYoHp4EKL_XkMs= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -10781,45 +12979,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 27648 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 16 + LoopUnroll: 256 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -10829,16 +13031,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -10846,27 +13049,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -10874,36 +13083,45 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 57 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -10913,54 +13131,73 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT67nzdVgNPSHAFfaUVWPf8AHyV87qZfdLXSxTt8PdUT0U= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -10968,45 +13205,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35328 LdsInitCVgprs: false - LdsNumBytes: 28672 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 28672 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -11016,16 +13257,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -11033,27 +13275,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -11061,65 +13309,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 58 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 1024 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 3 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -11128,26 +13395,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11155,10 +13430,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -11167,28 +13444,30 @@ LVCB: 16 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 35840 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35840 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -11204,15 +13483,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -11220,27 +13500,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -11248,93 +13534,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 59 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT9VeB2l26tmID86Z-eb-0Ldt0BFQh-2VDVmGDWFAzmG5Y= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11342,45 +13656,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 53248 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 53248 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 48128 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -11390,16 +13708,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -11407,27 +13726,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -11435,93 +13760,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 60 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 12 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 12 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1BAuwKtaK790HSQyCfzAeNUneQbsExjtZRoskdUyeOOw= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11529,40 +13882,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 27136 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 + LdsNumBytes: 27136 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: -1 - LocalReadVectorWidth: 4 + LdsOffsetMetadata: 27136 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -11577,16 +13934,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -11594,27 +13952,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 14 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -11622,93 +13986,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 61 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1PDJ-HctGau6vTDCGJC3_7O9axp4HkeilJIhBEH6fEsM= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11716,45 +14108,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 LdsInitCVgprs: false - LdsNumBytes: 58880 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 13824 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -11764,16 +14160,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -11781,27 +14178,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -11809,93 +14212,121 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 62 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT39I9YOvyVhA-YadZQj4JiBAyNZ5L84t-_3ZJGhmgX2aE= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 - ForceDisableShadowInit: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11903,45 +14334,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 29184 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 - LdsPadMetadata: -1 + LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 16 + LoopUnroll: 256 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -11952,15 +14387,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -11968,27 +14404,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -11996,65 +14438,84 @@ PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 63 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -12063,27 +14524,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 512 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -12091,45 +14559,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 512 - LSCB: 512 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 32 - LoopUnroll: 512 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -12139,16 +14611,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -12156,93 +14629,118 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 0 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 64 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1 - SourceSwap: 0 - StaggerU: 8 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -12251,27 +14749,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 512 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -12279,45 +14784,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 512 - LSCB: 512 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 13824 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 32 - LoopUnroll: 512 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -12327,16 +14836,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -12344,123 +14854,155 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 8 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 0 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 65 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1 - SourceSwap: 0 - StaggerU: 0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 8 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1LCsdpT6oKM2TBprDV1rbxZccBQ6gdX95FqAxQt5iXgM= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 512 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -12468,45 +15010,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 512 - LSCB: 512 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 32 - LoopUnroll: 512 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -12516,16 +15062,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -12533,122 +15080,155 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 0 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 66 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1 - SourceSwap: 0 - StaggerU: 0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT45BfCSjBoFp38CVG7yTN_yKslEK0pBNsbzLpJZAfP724= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 8 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -12656,10 +15236,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA7_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -12668,28 +15250,30 @@ LVCB: 16 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 LdsInitCVgprs: false - LdsNumBytes: 48640 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 46592 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 48640 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 46592 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -12704,16 +15288,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 48 - MacroTileA: 128 - MacroTileB: 48 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -12721,122 +15306,155 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 + NonTemporalWS: 0 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 12 NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 3 + NumLoadsA: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 NumThreads: 256 - OptNoLoadLoop: 0 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 67 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU8_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA7_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 8] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3OkoGuW4YHifuD6FQCEDWDAISQT90sjhwvmiFONCnaVI= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 8 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -12844,45 +15462,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 52224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 52224 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -12892,16 +15514,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -12909,122 +15532,155 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 - OptNoLoadLoop: 0 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 68 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU8_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 8] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6OdQx4Lahg-zzlcNDqF_q5uagn-Xz-DPj5Lfdjzb6jEU= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 8 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13032,10 +15688,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -13044,33 +15702,35 @@ LVCB: 16 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 62464 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 62464 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -13080,16 +15740,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 6] - MIWaveTileA: 2 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -13097,93 +15758,118 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 - OptNoLoadLoop: 0 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 69 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU8_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 16 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 8] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -13192,27 +15878,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 8 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13220,45 +15913,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 LdsInitCVgprs: false - LdsNumBytes: 17408 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 13824 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -13268,16 +15965,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -13285,123 +15983,155 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 64 - OptNoLoadLoop: 0 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 70 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU8_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM0_WGMXCC1 - SourceSwap: 0 - StaggerU: 0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 8] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6HbA1UcGDvRDLfMEBisSOLkFp-js5IeIZGShfJ2tpN6A= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 8 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13409,45 +16139,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false LSCA: 256 LSCB: 256 - LSPA: 4 - LSPB: 4 + LSPA: 8 + LSPB: 8 LVCA: 32 LVCB: 32 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 42496 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 42496 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 59392 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -13457,16 +16191,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 48 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 48 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -13474,80 +16209,96 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 0 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 71 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU8_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1 - SourceSwap: 0 - StaggerU: 0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 8 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 8] + WorkspaceCheck: [4, 0, -1] _DepthU: 256 _DepthUA: 256 _DepthUB: 256 @@ -13558,10 +16309,18 @@ _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -13570,27 +16329,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 8 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13598,45 +16364,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -13646,16 +16416,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -13663,93 +16434,118 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 0 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 72 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU8_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 8] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -13758,27 +16554,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 8 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13786,45 +16589,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35328 LdsInitCVgprs: false - LdsNumBytes: 35840 - LdsNumElementsAlignedA: 17408 + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35840 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -13834,16 +16641,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 2] + MIWaveTileA: 8 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -13851,19 +16659,24 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 4 @@ -13872,72 +16685,92 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 - OptNoLoadLoop: 0 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 73 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU8_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 32 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 8 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 8] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -13946,27 +16779,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 8 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13974,25 +16814,28 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 48640 + LdsNumBytes: 51712 LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 13824 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -14001,18 +16844,19 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 48640 + LdsOffsetMetadata: 51712 LdsOffsetMetadata_Blk: 100352 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -14023,15 +16867,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 48 - MacroTileA: 128 - MacroTileB: 48 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -14039,93 +16884,118 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularB: 4 NumThreads: 256 - OptNoLoadLoop: 0 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 74 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU8_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 8] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -14134,27 +17004,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 512 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14162,45 +17039,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 512 - LSCB: 512 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 32 - LoopUnroll: 512 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -14210,16 +17091,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -14227,94 +17109,118 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 0 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 75 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 1024 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -14323,27 +17229,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 512 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14351,45 +17264,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 512 - LSCB: 512 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x112x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17920 LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 17920 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 17920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 0 + LdsOffsetB_Blk: 32768 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 + LdsOffsetMetadata: 17920 + LdsOffsetMetadata_Blk: 32768 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 32 - LoopUnroll: 512 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -14399,16 +17316,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 112 + MacroTileA: 256 + MacroTileB: 112 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -14416,122 +17334,154 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 2 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 0 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 76 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1 - SourceSwap: 0 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 1024 - StorePriorityOpt: 0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x112x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1en3qwEwmF0pe8TKH4Ay3SLC0poKZHE6_Bt4ZbERhXZs= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -14540,14 +17490,16 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false LSCA: 512 LSCB: 512 - LSPA: 2 - LSPB: 2 + LSPA: 4 + LSPB: 4 LVCA: 64 LVCB: 64 LVPA: 1 @@ -14555,30 +17507,32 @@ LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 33792 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 32 - LoopUnroll: 512 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -14588,16 +17542,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] + MIWaveGroup: [1, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -14605,51 +17560,61 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 0 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 77 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1 - SourceSwap: 0 - StaggerU: 8 - StaggerUMapping: 1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 1024 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 8 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 @@ -14660,10 +17625,14 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -14673,12 +17642,14 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 512 _DepthUA: 512 _DepthUB: 512 @@ -14689,39 +17660,55 @@ _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6B_V_BTIBS0uoY1Y_pFReyxbrzcG822ShQaZfJj3RafI= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 512 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14729,45 +17716,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 512 - LSCB: 512 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 27136 LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 27136 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 27136 + LdsOffsetMetadata_Blk: 41472 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 32 - LoopUnroll: 512 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -14777,16 +17768,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -14794,123 +17786,155 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 0 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 78 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1izoOxhf92A4L_LG-wrMQBmb6Q9exFMeVpyyuCCoViIM= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14918,10 +17942,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 256 LSCB: 256 LSPA: 8 @@ -14930,28 +17956,30 @@ LVCB: 32 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -14966,16 +17994,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -14983,80 +18012,96 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 - OptNoLoadLoop: 0 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 79 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 256 _DepthUA: 256 _DepthUB: 256 @@ -15067,10 +18112,18 @@ _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -15079,26 +18132,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -15107,10 +18167,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -15120,27 +18182,29 @@ LVPA: 2 LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -15155,16 +18219,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] MIWaveTileA: 4 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -15172,80 +18237,96 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 - OptNoLoadLoop: 0 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 80 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SynchronizerSizeCheck: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 16 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 @@ -15256,10 +18337,18 @@ _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -15268,27 +18357,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -15296,10 +18392,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -15308,33 +18406,35 @@ LVCB: 16 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49152 LdsInitCVgprs: false - LdsNumBytes: 48640 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 48640 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -15344,16 +18444,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 3] - MIWaveTileA: 2 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 48 - MacroTileA: 128 - MacroTileB: 48 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -15361,80 +18462,96 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 3 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 - OptNoLoadLoop: 0 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 81 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 16 ThreadTile1: 3 - ThreadTileA: 8 + ThreadTileA: 16 ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 @@ -15445,39 +18562,55 @@ _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1tFc4L62UtE31KWtUmzfM_rVf_Ic0PstsBB9_Ws5j8qc= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -15485,21 +18618,24 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false LdsNumBytes: 51200 LdsNumElementsAlignedA: 33792 @@ -15519,11 +18655,12 @@ LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -15533,16 +18670,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -15550,94 +18688,118 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 0 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 82 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 16 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 16 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -15646,27 +18808,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 512 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -15674,45 +18843,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 512 - LSCB: 512 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 13824 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 32 - LoopUnroll: 512 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -15722,16 +18895,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -15739,123 +18913,155 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 0 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 83 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1 - SourceSwap: 0 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 1024 - StorePriorityOpt: 0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 8 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6QMO2oG6Zs8jRCpzsHLvNlZOPvSuSWEOxKoV_uSu4ssY= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 512 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -15863,45 +19069,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 512 - LSCB: 512 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 33792 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 32 - LoopUnroll: 512 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -15911,16 +19121,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -15928,122 +19139,154 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 0 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 84 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1 - SourceSwap: 0 - StaggerU: 8 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT4kJyY7kfbZJm6OjNDKtfmbP0rmCZtsMgycYFd9Lz6KAg= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 512 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -16052,45 +19295,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 512 - LSCB: 512 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 38400 LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 38400 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 30720 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 73216 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 38400 + LdsOffsetMetadata_Blk: 73216 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 32 - LoopUnroll: 512 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -16100,16 +19347,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 48 + MacroTile1: 192 + MacroTileA: 48 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -16117,65 +19365,79 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 0 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 85 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1 - SourceSwap: 0 - StaggerU: 0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -16185,54 +19447,72 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT9rLJSjxTLFyk6rE5ZrBbLYo4K_rPheyG7SPdGi96yt-I= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 512 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -16241,45 +19521,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 512 - LSCB: 512 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 32 - LoopUnroll: 512 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -16289,16 +19573,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -16306,65 +19591,79 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 0 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 86 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1 - SourceSwap: 0 - StaggerU: 8 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -16374,26 +19673,36 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -16402,27 +19711,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -16430,45 +19746,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -16478,16 +19798,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -16495,94 +19816,118 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 0 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 87 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1 - SourceSwap: 0 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 512 - StorePriorityOpt: 0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 8 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -16591,26 +19936,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -16619,40 +19971,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 128 LSCB: 128 - LSPA: 8 - LSPB: 8 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 1 - LVPB: 1 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60928 LdsInitCVgprs: false - LdsNumBytes: 64000 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 60928 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 91648 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -16667,16 +20023,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 48 - MacroTileA: 64 - MacroTileB: 48 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -16684,93 +20041,118 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 128 - OptNoLoadLoop: 0 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 PrefetchLocalRead: 1 - PreloadKernArgs: 1 + PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 88 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1 - SourceSwap: 0 - StaggerU: 8 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -16779,26 +20161,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 6 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -16806,46 +20196,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - L1CacheSwizzle: false - LSCA: 256 - LSCB: 256 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25216 LdsInitCVgprs: false - LdsNumBytes: 17408 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 25216 + LdsNumElementsAlignedA: 8320 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetB: 8320 + LdsOffsetB_Blk: 41088 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 25216 + LdsOffsetMetadata_Blk: 41088 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -16855,16 +20248,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -16872,27 +20266,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 - NumThreads: 64 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -16905,89 +20305,115 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 89 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU6_GSUAMBSK_GLS0_ISA942_IU1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM8_WGMXCC1 - SourceSwap: 0 - StaggerU: 8 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 64 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 8 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 6] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1CjDeuU1TGJDlkCLbl3m2EFD02HesvlcFsgCorT7UcMc= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -16996,45 +20422,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 58880 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -17044,16 +20474,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] + MIWaveGroup: [1, 4] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -17061,27 +20492,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NumLoadsA: 1 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -17094,20 +20531,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 90 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM8_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 1 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 ThreadTile1: 1 @@ -17115,10 +20557,14 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -17128,55 +20574,73 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6eQZ2BclKSIt7Pd63Lt1oP0sV5ptCXXnWcj18stiAQnc= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -17184,45 +20648,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 LdsInitCVgprs: false - LdsNumBytes: 58880 + LdsNumBytes: 45056 LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 65536 LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -17232,16 +20700,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -17249,27 +20718,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -17282,88 +20757,115 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 91 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW1_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM8_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 1 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6RtR09yT8ajujnmdqNrDFPiOIqGlncLwGLyVGziEeOMU= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 5 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -17372,10 +20874,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x112x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -17387,30 +20891,32 @@ LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 52224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 52224 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -17420,16 +20926,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] + MIWaveGroup: [1, 1] + MIWaveTile: [4, 7] MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 112 + MacroTileA: 64 + MacroTileB: 112 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -17437,27 +20944,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 7 + NumLoadsA: 4 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -17470,31 +20983,40 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 92 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU5_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x112x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 7 ThreadTileA: 16 - ThreadTileB: 2 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -17504,55 +21026,73 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 5] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6gPZxIGIVbEgykk65czZrGO3xeN-NSYQtxBSVpvjQ7mk= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 5 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -17560,10 +21100,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -17572,28 +21114,30 @@ LVCB: 16 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 LdsInitCVgprs: false - LdsNumBytes: 62464 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 13824 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 62464 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -17609,15 +21153,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 6] - MIWaveTileA: 2 - MIWaveTileB: 6 + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -17625,27 +21170,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -17658,88 +21209,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 93 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU5_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 5] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1wlPOMwfJQM5srBalO5DZkcm6U4X2FO028oQoNgltA8Y= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -17747,25 +21326,28 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 51200 + LdsNumBytes: 61440 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -17774,13 +21356,14 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 + LdsOffsetMetadata: 61440 LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 2 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -17795,16 +21378,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -17812,27 +21396,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -17845,89 +21435,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 94 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 512 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3hjm-EOWVOW-cZPmGlwg4ylcIRl60DNUJp-5hVZ8I0ww= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -17935,45 +21552,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false LSCA: 256 LSCB: 256 - LSPA: 2 - LSPB: 2 + LSPA: 8 + LSPB: 8 LVCA: 32 LVCB: 32 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 17408 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -17984,15 +21605,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -18000,27 +21622,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -18033,90 +21661,115 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 95 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 16 SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true - UnrollLoopSwapGlobalReadOrder: 0 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 256 _DepthUA: 256 _DepthUB: 256 _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT87GULbMh-pIm1O84Zam8m0fIzGd76cmyOVKCexMro4-k= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -18125,45 +21778,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 17408 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 57856 + LdsOffsetMetadata_Blk: 88576 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -18173,16 +21830,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 80 + MacroTile1: 128 + MacroTileA: 80 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -18190,27 +21848,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 5 NumLoadsPerpendicularB: 8 - NumThreads: 64 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -18223,62 +21887,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 96 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -18287,27 +21968,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 512 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -18315,45 +22003,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LSCA: 512 - LSCB: 512 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 32 - LoopUnroll: 512 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -18363,16 +22055,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -18380,27 +22073,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 16 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 6 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -18413,91 +22112,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 97 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM0_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true - UnrollLoopSwapGlobalReadOrder: 0 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6nQ-vdmI9Ye155heVQJZyA_zECvR_hEOnSldEz9oZyYE= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -18505,45 +22229,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 17408 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -18553,16 +22281,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -18570,27 +22299,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 - NumThreads: 64 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -18603,62 +22338,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 98 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM0_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -18667,26 +22419,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -18695,10 +22454,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -18710,10 +22471,11 @@ LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 22528 + LdsNumBytes: 32768 LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 @@ -18722,13 +22484,14 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 22528 + LdsOffsetMetadata: 32768 LdsOffsetMetadata_Blk: 50176 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -18744,15 +22507,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 1] + MIWaveTile: [4, 3] MIWaveTileA: 4 - MIWaveTileB: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 96 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -18760,27 +22524,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 3 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -18793,32 +22563,40 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 99 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 3 ThreadTileA: 16 - ThreadTileB: 1 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -18828,27 +22606,36 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -18857,26 +22644,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 32 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -18885,45 +22679,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60928 LdsInitCVgprs: false - LdsNumBytes: 10624 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 2176 + LdsNumBytes: 60928 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10624 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -18933,16 +22731,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 4] + MIWaveTileA: 12 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -18950,27 +22749,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -18983,91 +22788,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 100 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 64 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1diKeSgAQ9WnpgE7yXWYM0aDLuPPSMgnFJJEzXtTSG_o= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -19075,45 +22905,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -19123,16 +22957,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -19140,27 +22975,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -19173,90 +23014,115 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 101 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM0_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT9i-37dkhz38zG_IGCg71Jvx6yXuta5n7GNOpIWVrZURc= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -19265,45 +23131,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30720 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 30720 + LdsOffsetMetadata_Blk: 48128 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -19313,16 +23183,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -19330,27 +23201,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -19363,32 +23240,40 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 102 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM0_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - SubGroup0: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -19398,56 +23283,73 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 0 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1ZbV_we61dHb-7MaSo5V0WsygAmTrbCXJlq_91MCaYgk= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -19455,45 +23357,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25600 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -19503,16 +23409,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -19520,27 +23427,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -19553,90 +23466,115 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 103 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true - UnrollLoopSwapGlobalReadOrder: 0 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT19xYcMmdMV0cprLqCTqlkMexKqqIxc56ZfY1qhwmpRcY= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -19645,45 +23583,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -19693,16 +23635,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] + MIWaveGroup: [1, 4] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 16 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 16 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -19710,27 +23653,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -19743,20 +23692,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 104 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 ThreadTile1: 1 @@ -19764,11 +23718,14 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true - UnrollLoopSwapGlobalReadOrder: 0 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -19778,56 +23735,73 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6XzsrgajiojbvtFstfLt7imgW3dhnW8-xSZB2cbcnaLI= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -19835,45 +23809,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -19883,16 +23861,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -19900,27 +23879,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -19933,91 +23918,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 105 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 1 - ActivationAlt: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1Nm7CTnP4vCeM9IY0naeP24CoXDvkzDuwtSenoKaBL9g= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -20025,40 +24035,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23680 LdsInitCVgprs: false - LdsNumBytes: 3328 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 2176 + LdsNumBytes: 23680 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 15232 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 5248 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 3328 - LdsOffsetMetadata_Blk: 5248 + LdsOffsetMetadata: 23680 + LdsOffsetMetadata_Blk: 41216 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -20073,16 +24087,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -20090,27 +24105,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -20123,90 +24144,115 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 106 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 64 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true - UnrollLoopSwapGlobalReadOrder: 0 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT8Tx6OV6SS13girzsGf3ltEbbMPC3VNY_MdVaqPd4SPlk= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -20215,40 +24261,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 31232 LdsInitCVgprs: false - LdsNumBytes: 10240 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 + LdsNumBytes: 31232 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 45568 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 21504 + LdsOffsetMetadata: 31232 + LdsOffsetMetadata_Blk: 45568 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -20263,16 +24313,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 80 + MacroTile1: 128 + MacroTileA: 80 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -20280,27 +24331,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NonTemporalWS: 0 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -20313,91 +24370,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 107 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6WICaKMSg_J0qmh07Iyv5FwzPcx96ulWOEF3l24yvSic= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -20405,45 +24487,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 7680 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 7680 - LdsOffsetMetadata_Blk: 10752 + LdsOffsetMetadata: 39936 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -20453,16 +24539,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 80 + MacroTileA: 64 + MacroTileB: 80 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -20470,27 +24557,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 5 + NumLoadsA: 4 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -20503,90 +24596,115 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 108 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1INOkhbGKAAilWSRensj7VTtw4co501aQSbqiXTCyhAU= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -20595,45 +24713,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LSCA: 128 - LSCB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 LSPA: 8 LSPB: 8 - LVCA: 16 - LVCB: 16 + LVCA: 32 + LVCB: 32 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17408 LdsInitCVgprs: false - LdsNumBytes: 13824 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 17408 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 13824 - LdsOffsetMetadata_Blk: 20992 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 41472 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -20643,16 +24765,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] + MIWaveGroup: [1, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 16 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 16 - MacroTileB: 32 + MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -20660,27 +24783,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -20693,20 +24822,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 109 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 ThreadTile1: 1 @@ -20714,11 +24848,14 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true - UnrollLoopSwapGlobalReadOrder: 0 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -20728,56 +24865,73 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1fbfdryne5326jQiClBJLLBnDIlq20lK5C78o6RlcPzQ= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -20785,40 +24939,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 11776 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 2560 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 11776 - LdsOffsetMetadata_Blk: 25600 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -20833,16 +24991,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -20850,27 +25009,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -20883,62 +25048,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 110 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 0 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -20947,27 +25129,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -20975,40 +25164,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 17920 - LdsNumElementsAlignedA: 7680 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 7680 - LdsOffsetB_Blk: 40448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17920 - LdsOffsetMetadata_Blk: 40448 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -21023,16 +25216,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 64 - MacroTileA: 48 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -21040,27 +25234,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -21073,62 +25273,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 111 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 1 - ThreadTileA: 12 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 8 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -21137,27 +25354,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -21165,10 +25389,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -21177,28 +25403,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 27136 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27136 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -21213,16 +25441,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -21230,27 +25459,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -21263,91 +25498,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 112 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 8 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT9x_-1CoeaNnxLNQyAvqUfqaamG7WKONjZ27cI7FnYA8w= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -21355,10 +25615,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -21367,28 +25629,30 @@ LVCB: 16 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -21404,15 +25668,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 + MIWaveTile: [3, 3] + MIWaveTileA: 3 MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 96 MacroTile1: 96 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -21420,27 +25685,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 6 NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -21453,62 +25724,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 113 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 12 ThreadTile1: 3 - ThreadTileA: 16 + ThreadTileA: 12 ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -21517,27 +25805,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -21545,45 +25840,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 63744 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 30464 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63744 - LdsOffsetMetadata_Blk: 98816 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -21594,15 +25893,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -21610,27 +25910,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 8 - NumLoadsB: 7 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -21643,62 +25949,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 114 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC4_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true - UnrollLoopSwapGlobalReadOrder: 0 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 304 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -21707,27 +26030,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -21735,10 +26065,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -21747,28 +26079,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 34048 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34048 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 59392 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -21784,15 +26118,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -21800,27 +26135,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -21833,62 +26174,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 115 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 8 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -21897,27 +26255,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 32 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -21925,45 +26290,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 LdsInitCVgprs: false - LdsNumBytes: 34048 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34048 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -21974,15 +26343,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveTile: [6, 5] + MIWaveTileA: 6 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 192 + MacroTile1: 160 + MacroTileA: 192 + MacroTileB: 160 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -21990,27 +26360,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 16 - NumLoadsB: 16 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 120 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 6 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -22023,91 +26399,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 116 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 + ThreadTile0: 24 + ThreadTile1: 5 + ThreadTileA: 24 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true - UnrollLoopSwapGlobalReadOrder: 0 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6IIZhUoHL_IJRC0XQPXusLlXveY2UtiTbaaczcCMLgBM= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 4 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -22115,45 +26516,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 37376 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 2560 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 37376 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -22163,16 +26568,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 16 - MacroTileA: 256 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -22180,10 +26586,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -22191,16 +26601,18 @@ NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -22213,90 +26625,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 117 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1XYUQ3KlJ5tCEiBWpTBtLUlsfh8DH6n8kQfnHcKspwhg= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -22304,47 +26742,51 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63488 LdsInitCVgprs: false - LdsNumBytes: 63744 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 30464 + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 35840 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63744 - LdsOffsetMetadata_Blk: 98816 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 4 LoopUnroll: 64 MFMA_BF16_1K: true - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 @@ -22353,15 +26795,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 + MIWaveTile: [6, 7] + MIWaveTileA: 6 MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 256 + MacroTile0: 192 MacroTile1: 224 - MacroTileA: 256 + MacroTileA: 192 MacroTileB: 224 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -22369,27 +26812,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 32 - NumLoadsB: 28 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 168 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 6 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 28 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -22402,60 +26851,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 118 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 24 ThreadTile1: 7 - ThreadTileA: 32 + ThreadTileA: 24 ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -22464,26 +26932,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 32 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -22492,45 +26967,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35328 LdsInitCVgprs: false - LdsNumBytes: 33280 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -22540,16 +27019,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] + MIWaveGroup: [1, 4] + MIWaveTile: [8, 2] MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -22557,10 +27037,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -22568,16 +27052,18 @@ NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -22590,25 +27076,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 119 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC8_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22617,36 +27108,47 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 - VectorWidthB: 8 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -22655,27 +27157,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -22683,10 +27192,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -22695,28 +27206,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 47616 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 35840 + LdsNumBytes: 47616 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 30720 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 47616 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -22731,16 +27244,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 7] - MIWaveTileA: 6 - MIWaveTileB: 7 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 224 - MacroTileA: 192 - MacroTileB: 224 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -22748,27 +27262,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 168 - NumGlobalWriteVectorsPerThread: 84 - NumLoadsA: 6 - NumLoadsB: 7 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 7 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -22781,92 +27301,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 120 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC8_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 7 - ThreadTileA: 24 - ThreadTileB: 7 + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 8 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6i5TKooAQCaKX8xr9szBCdevvk_f81umTcdf_ivG2Ylk= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 32 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 4 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -22874,45 +27418,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 16896 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -22923,15 +27471,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -22939,20 +27488,25 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -22960,6 +27514,7 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -22972,63 +27527,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 121 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCG32 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 32 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -23037,27 +27608,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 32 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -23065,45 +27643,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 49408 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25088 - LdsOffsetMetadata_Blk: 49408 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -23114,15 +27696,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -23130,27 +27713,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -23163,25 +27752,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 122 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 - StoreVectorWidth: 8 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23190,36 +27784,47 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 0 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -23228,27 +27833,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 8 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -23256,45 +27868,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA2_WSGRB0_WS64_WG64_4_1 - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44544 LdsInitCVgprs: false - LdsNumBytes: 60928 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 26112 + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 60928 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 91648 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -23304,16 +27920,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 6] - MIWaveTileA: 2 - MIWaveTileB: 6 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 2] + MIWaveTileA: 12 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -23321,27 +27938,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 7 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 96 NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsA: 6 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -23354,25 +27977,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 123 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU8_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA2_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC8_WGMXCCG32 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23381,36 +28009,47 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 2 - WaveSeparateGlobalReadA: 2 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 32 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 8] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -23419,27 +28058,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -23447,40 +28093,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW2_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA2_WSGRB2_WS64_WG16_16_1 - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49152 LdsInitCVgprs: false - LdsNumBytes: 38912 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 83968 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 38912 - LdsOffsetMetadata_Blk: 83968 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -23495,16 +28145,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -23512,27 +28163,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 6 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -23545,63 +28202,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 124 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW2_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA2_WSGRB2_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 - StoreVectorWidth: 2 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -23610,27 +28283,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 7 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -23638,45 +28318,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25088 LdsInitCVgprs: false - LdsNumBytes: 55808 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 51200 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 25088 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -23686,16 +28370,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -23703,27 +28388,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -23736,63 +28427,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 125 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU7_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC32_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 64 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 7] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -23801,27 +28508,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -23829,45 +28543,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO1_USL0_UIOFGRO1_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 42496 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 42496 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -23877,16 +28595,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -23894,27 +28613,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -23927,25 +28652,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 126 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO1_USL0_UIOFGRO1_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM38_WGMXCC8_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23953,66 +28683,85 @@ UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true - Use64bShadowLimit: 0 - UseInstOffsetForGRO: 1 - UseSgprForGRO: 0 + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 38 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1rFSK2R4cPqBcW2t7WfbbdkJecrwL8eR6UpC423foPY8= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -24020,45 +28769,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 13824 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 25600 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -24068,16 +28821,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -24085,27 +28839,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -24118,25 +28878,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 127 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -24145,36 +28910,47 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -24183,27 +28959,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -24211,45 +28994,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25216 LdsInitCVgprs: false - LdsNumBytes: 42496 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 25216 + LdsNumElementsAlignedA: 8320 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8320 + LdsOffsetB_Blk: 41088 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 42496 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 25216 + LdsOffsetMetadata_Blk: 41088 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -24259,16 +29046,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -24276,27 +29064,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -24309,92 +29103,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 128 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC32_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 64 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 8 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT4JPWqoKzFMyXv17CN5WS64-ufHDtSjZifT_9H7EtPBXI= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -24402,40 +29220,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 42496 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 + LdsNumBytes: 42496 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 73216 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 42496 + LdsOffsetMetadata_Blk: 73216 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -24450,16 +29272,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 + MacroTile0: 48 + MacroTile1: 256 + MacroTileA: 48 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -24467,27 +29290,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 14 + NonTemporalWS: 0 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -24500,90 +29329,114 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 129 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM38_WGMXCC32_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 38 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false + AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false - CustomKernelName: Custom_Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_K1_MIWT4_16_DTVA + CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -24592,10 +29445,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 0, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_K1_MIWT4_16_DTVA + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -24605,30 +29460,34 @@ LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 67072 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67072 - LdsOffsetMetadata_Blk: 164864 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -24638,16 +29497,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 16] + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] MIWaveTileA: 4 - MIWaveTileB: 16 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -24655,124 +29515,155 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false - NoReject: true + NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 130 - SolutionNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_Bias_AS_SAV_UserArgs_MT256x256x64_MI16x16x1_SN_K1_MIWT4_16_DTVA - SourceSwap: false - StaggerU: 32 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 16 + ThreadTile1: 3 ThreadTileA: 16 - ThreadTileB: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 8 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: -1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT4wIjEL27vX5d7EsvtOKRtWYuscRcj5qwe3CisKP1gUEI= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -24780,45 +29671,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB0_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL0_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32256 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 46592 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 46592 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -24828,16 +29723,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -24845,27 +29741,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -24878,24 +29780,29 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 131 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB0_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL0_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT48x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 12 ThreadTile1: 1 - ThreadTileA: 8 + ThreadTileA: 12 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -24904,37 +29811,48 @@ UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true - Use64bShadowLimit: 0 + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -24943,27 +29861,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 2 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -24971,40 +29896,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA2_WSGRB2_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -25019,16 +29948,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 160 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 160 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -25036,27 +29966,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -25069,25 +30005,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 132 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU2_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA2_WSGRB2_WS64_WG16_16_1_WGM0_WGMXCC32_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25096,65 +30037,84 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 2] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6JlvjtXTQ32OEjSMulxSMXR8zOyTjDwLkD-T-RrNVSnY= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 32 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 4 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -25162,45 +30122,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25088 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -25211,15 +30175,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -25227,27 +30192,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -25260,25 +30231,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 133 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCG4 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25287,36 +30263,47 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 4 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -25325,27 +30312,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -25353,10 +30347,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -25365,28 +30361,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 47616 LdsInitCVgprs: false - LdsNumBytes: 60928 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 47616 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 30720 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 60928 - LdsOffsetMetadata_Blk: 91648 + LdsOffsetMetadata: 47616 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -25402,15 +30400,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [12, 4] - MIWaveTileA: 12 - MIWaveTileB: 4 + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -25418,27 +30417,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -25451,92 +30456,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 134 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC8_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 8 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6S1geq6L2u08b5nB4vFyAXINv0YS2akU3sESXltqMKBI= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 2 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -25544,40 +30573,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA2_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 4 + LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 1 + LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 18432 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -25593,15 +30626,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -25609,27 +30643,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -25642,25 +30682,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 135 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU2_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM38_WGMXCC4_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25669,64 +30714,83 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 2 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 38 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 304 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 2] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT39IacBO3INVHNmhEoFR7uJcZcXsCL6uS74bfIgosJWXk= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 32 - DirectToLds: false + DebugStreamK: 0 + DepthU: 512 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -25735,45 +30799,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA2_WSGRB2_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 LSPA: 4 LSPB: 4 - LVCA: 8 - LVCB: 8 + LVCA: 64 + LVCB: 64 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 21504 - LdsNumElementsAlignedA: 13056 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 13056 - LdsOffsetB_Blk: 45824 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 21504 - LdsOffsetMetadata_Blk: 45824 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -25783,16 +30851,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 4] - MIWaveTileA: 6 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -25800,27 +30869,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -25833,25 +30908,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 136 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA2_WSGRB2_WS64_WG32_8_1_WGM304_WGMXCC8_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 64 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 4 - ThreadTileA: 24 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25860,64 +30940,83 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 304 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1wwwHa4bSivVmbymd6x5hI96nePwYOxTArtM0VLZaDn4= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -25926,25 +31025,28 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB2_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 - LSPB: 4 + LSPB: 32 LVCA: 8 LVCB: 8 LVPA: 4 - LVPB: 1 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 LdsInitCVgprs: false - LdsNumBytes: 51200 + LdsNumBytes: 53248 LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedB: 35840 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -25953,13 +31055,14 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 + LdsOffsetMetadata: 53248 LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -25975,15 +31078,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 8] + MIWaveTile: [4, 7] MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveTileB: 7 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 256 + MacroTile1: 224 MacroTileA: 128 - MacroTileB: 256 + MacroTileB: 224 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -25991,27 +31095,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -26024,92 +31134,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 137 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB2_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCG32 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 8 + ThreadTile1: 7 ThreadTileA: 16 - ThreadTileB: 8 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 8 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 32 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT31PBd2Sd0QiAFe378WdZbJauPh9DSkCb4GOW2roNTc3A= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -26117,40 +31251,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25600 LdsInitCVgprs: false - LdsNumBytes: 60928 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 60928 - LdsOffsetMetadata_Blk: 91648 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 49664 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -26165,16 +31303,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 4] - MIWaveTileA: 12 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -26182,27 +31321,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -26215,92 +31360,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 138 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC32_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3WGUNFaPiFZEsQykz12nvvqJ20clOxDWvXN-rOKynqJI= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 6 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -26308,45 +31477,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB2_WS64_WG64_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 4 - LVCA: 8 - LVCB: 8 - LVPA: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 LVPB: 1 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 48640 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 48640 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 16 + LoopUnroll: 256 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -26356,16 +31529,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 6] - MIWaveTileA: 4 - MIWaveTileB: 6 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 96 - MacroTileA: 256 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -26373,27 +31547,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 3 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -26406,25 +31586,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 139 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU6_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB2_WS64_WG64_4_1_WGM0_WGMXCC32_WGMXCCG32 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 6 - ThreadTileA: 16 - ThreadTileB: 6 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26433,65 +31618,84 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 32 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 6] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1shqk8IA8kFz56wRSKTEn_qnIrAIB5ScjcVmMFE2Geo8= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 6 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -26499,40 +31703,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x144x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 45056 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 0 + LdsOffsetB_Blk: 32768 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45056 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 32768 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -26547,16 +31755,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 4] - MIWaveTileA: 6 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [3, 9] + MIWaveTileA: 3 + MIWaveTileB: 9 MIWaveTileMetadata: 0 MacroTile0: 192 - MacroTile1: 128 + MacroTile1: 144 MacroTileA: 192 - MacroTileB: 128 + MacroTileB: 144 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -26564,27 +31773,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 + NonTemporalWS: 0 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 108 + NumGlobalWriteVectorsPerThread: 108 NumLoadsA: 6 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsB: 9 + NumLoadsCoalescedA: 2 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 9 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -26597,25 +31812,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 140 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU6_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x144x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 4 - ThreadTileA: 24 - ThreadTileB: 4 + ThreadTile0: 12 + ThreadTile1: 9 + ThreadTileA: 12 + ThreadTileB: 9 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26624,36 +31844,47 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 6] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -26662,27 +31893,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 6 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -26690,10 +31928,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -26702,28 +31942,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 48640 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 48640 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -26738,16 +31980,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 6] - MIWaveTileA: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 MIWaveTileB: 6 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 96 + MacroTile1: 192 MacroTileA: 256 - MacroTileB: 96 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -26755,27 +31998,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 96 + NumElementsPerThread: 192 NumGlobalWriteVectorsPerThread: 24 NumLoadsA: 8 - NumLoadsB: 3 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -26788,24 +32037,29 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 141 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU6_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCG4 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 6 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true @@ -26815,36 +32069,47 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 8 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 6] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -26853,27 +32118,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 6 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -26881,40 +32153,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB2_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 4 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 45056 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45056 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -26929,16 +32205,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 4] - MIWaveTileA: 6 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 192 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 192 + MacroTileA: 64 MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -26946,27 +32223,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -26979,24 +32262,29 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 142 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU6_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB2_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 + ThreadTile0: 16 ThreadTile1: 4 - ThreadTileA: 24 + ThreadTileA: 16 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -27006,64 +32294,83 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 6] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3AYenl1vy7KIoQbqtVTAUkiATkIlrPR86G34YXYEoLaE= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 6 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -27072,40 +32379,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB2_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 4 - LVCA: 8 - LVCB: 8 - LVPA: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x80x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60416 LdsInitCVgprs: false - LdsNumBytes: 45056 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 60416 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 43520 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45056 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 60416 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -27120,16 +32431,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 4] - MIWaveTileA: 6 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 80 + MacroTileA: 32 + MacroTileB: 80 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -27137,27 +32449,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 4 + NumElementsPerThread: 10 + NumGlobalWriteVectorsPerThread: 5 + NumLoadsA: 4 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 10 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -27170,63 +32488,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 143 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU6_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB2_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x80x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 4 - ThreadTileA: 24 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 6] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -27235,27 +32569,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 6 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -27263,45 +32604,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true, + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x144x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA4_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41472 LdsInitCVgprs: false - LdsNumBytes: 48640 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 41472 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 41472 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 0 + LdsOffsetB_Blk: 65536 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 48640 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 + LdsOffsetMetadata: 41472 + LdsOffsetMetadata_Blk: 65536 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -27312,15 +32657,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [4, 6] - MIWaveTileA: 4 - MIWaveTileB: 6 + MIWaveTile: [2, 9] + MIWaveTileA: 2 + MIWaveTileB: 9 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 96 - MacroTileA: 256 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 144 + MacroTileA: 128 + MacroTileB: 144 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -27328,27 +32674,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 NumLoadsA: 8 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 + NumLoadsB: 9 + NumLoadsCoalescedA: 4 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 9 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -27361,25 +32713,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 144 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU6_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x144x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA4_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 6 - ThreadTileA: 16 - ThreadTileB: 6 + ThreadTile0: 8 + ThreadTile1: 9 + ThreadTileA: 8 + ThreadTileB: 9 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27388,32 +32745,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 6] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -27425,11 +32791,13 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3sNlWh8L24cllkMB7inyMed274xTQ3V4Kb2J6XXTlB4M= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -27437,16 +32805,20 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true @@ -27462,8 +32834,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -27473,28 +32845,29 @@ LVPA: 2 LVPB: 2 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 64000 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 64000 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 64000 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -27509,16 +32882,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 192 + MacroTileA: 32 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -27526,28 +32900,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -27560,29 +32939,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 145 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x192x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 8 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27591,22 +32971,25 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 @@ -27617,6 +33000,12 @@ _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -27628,32 +33017,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT15k41xBCNr7ZTOeUW6R9VVehKSsybnv0VocWP1ZKaKxU= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -27665,44 +33060,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 17408 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 17408 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 41472 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -27712,16 +33108,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -27729,10 +33126,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -27742,15 +33143,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -27763,63 +33165,73 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 146 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -27834,8 +33246,9 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -27843,20 +33256,24 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -27868,8 +33285,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -27878,29 +33295,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 18432 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 18432 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -27916,15 +33334,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -27932,28 +33351,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -27966,15 +33390,16 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 147 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -27985,10 +33410,10 @@ SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27997,22 +33422,25 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 @@ -28023,6 +33451,12 @@ _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -28037,29 +33471,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -28071,44 +33510,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x320x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 38400 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 38400 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 21760 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 82176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 38400 + LdsOffsetMetadata_Blk: 82176 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -28119,15 +33559,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [8, 10] + MIWaveTileA: 8 + MIWaveTileB: 10 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 320 + MacroTileA: 256 + MacroTileB: 320 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -28135,10 +33576,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -28148,15 +33593,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -28169,15 +33615,16 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 148 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x320x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 64 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -28188,44 +33635,53 @@ SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 10 + ThreadTileA: 32 + ThreadTileB: 10 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 8 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -28240,29 +33696,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 2 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -28274,44 +33735,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -28321,16 +33783,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -28338,28 +33801,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 6 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -28372,63 +33840,73 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 149 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU2_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 512 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 8 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 2] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -28443,25 +33921,30 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true @@ -28477,39 +33960,40 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 27136 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 27136 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27136 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 59904 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -28524,16 +34008,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] + MIWaveGroup: [2, 1] + MIWaveTile: [4, 6] MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -28541,10 +34026,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -28554,15 +34043,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -28575,37 +34065,40 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 150 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 6 ThreadTileA: 16 - ThreadTileB: 2 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -28615,23 +34108,30 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -28646,25 +34146,30 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true @@ -28680,44 +34185,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - L1CacheSwizzle: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51712 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 51712 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51712 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -28727,16 +34233,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -28744,10 +34251,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -28757,15 +34268,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 3 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -28778,63 +34290,73 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 151 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 256 + StaggerUMapping: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 16 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -28846,32 +34368,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT63Jsyc19BiT8ixapz2U1jiZeH35fKEZv33zEvrjsx6fI= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -28883,44 +34411,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 24576 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -28931,15 +34460,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -28947,28 +34477,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -28981,15 +34516,16 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 152 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -29000,10 +34536,10 @@ SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -29012,32 +34548,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -29049,11 +34594,13 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6qBX_6L-KZbW8HiGJUYEsC0hpVgBufhV-Zx_9cHXzQoI= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -29061,20 +34608,24 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 2 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -29086,8 +34637,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -29096,34 +34647,35 @@ LVCB: 16 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 34304 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -29133,16 +34685,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -29150,28 +34703,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -29184,53 +34742,57 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 153 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU2_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 + StaggerUMapping: 0 StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 2] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 @@ -29241,6 +34803,12 @@ _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -29255,29 +34823,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -29289,24 +34862,24 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 51200 + LdsNumBytes: 59392 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -29315,18 +34888,19 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 + LdsOffsetMetadata: 59392 LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -29337,15 +34911,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -29353,28 +34928,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -29387,15 +34967,16 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 154 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -29406,45 +34987,54 @@ SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 8 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -29458,29 +35048,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -29492,44 +35087,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30720 + LdsBytesNoAmax: 23680 LdsInitCVgprs: false - LdsNumBytes: 30720 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 5120 + LdsNumBytes: 23680 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 15232 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 25600 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 23680 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -29540,15 +35136,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -29556,10 +35153,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -29569,15 +35170,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -29590,15 +35192,16 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 155 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 64 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -29609,10 +35212,10 @@ SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -29621,32 +35224,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -29658,32 +35270,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1wA0N1lxZFhLMeS9Q6iUSlvrcq-fSVqohTlzLTPllK2o= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -29695,44 +35313,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - L1CacheSwizzle: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60928 + LdsBytesNoAmax: 25088 LdsInitCVgprs: false - LdsNumBytes: 60928 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 26112 + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 60928 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 25088 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -29742,16 +35361,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 6] - MIWaveTileA: 2 - MIWaveTileB: 6 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 96 + MacroTile1: 256 MacroTileA: 128 - MacroTileB: 96 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -29759,10 +35379,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -29772,15 +35396,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -29793,64 +35418,74 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 156 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 256 + StaggerUMapping: 0 + StaggerUStride: 64 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -29864,29 +35499,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 4 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -29898,44 +35538,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26624 + LdsBytesNoAmax: 65024 LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 65024 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetB: 0 + LdsOffsetB_Blk: 32768 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26624 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 + LdsOffsetMetadata: 0 + LdsOffsetMetadata_Blk: 32768 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -29945,16 +35586,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -29962,28 +35604,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 2 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -29996,63 +35643,73 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 157 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU4_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 256 + StaggerUMapping: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 4] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -30064,28 +35721,34 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1xB12UW41szDey5KbR-koVeyqF6D9zrYMnbFyg2cG0Zc= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true @@ -30101,44 +35764,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - L1CacheSwizzle: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 91136 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -30148,16 +35812,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -30165,10 +35830,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -30177,16 +35846,17 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -30199,29 +35869,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 158 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SynchronizerSizeCheck: 1 - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -30230,32 +35901,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -30270,29 +35950,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -30304,44 +35989,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - L1CacheSwizzle: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 57856 + LdsOffsetMetadata_Blk: 88576 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -30351,16 +36037,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -30368,28 +36055,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -30402,63 +36094,73 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 159 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -30470,28 +36172,34 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6fynZLzW5Cr8eufvbxTPYhFkdOgpOoTYptADbfQWQbo8= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 3 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true @@ -30507,44 +36215,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -30554,16 +36263,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 1] MIWaveTile: [4, 2] MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -30571,21 +36281,25 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -30593,6 +36307,7 @@ NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -30605,22 +36320,23 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 160 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU3_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] @@ -30636,6 +36352,8 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -30645,23 +36363,30 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 3] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -30676,26 +36401,31 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 @@ -30710,44 +36440,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33280 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 33280 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -30758,15 +36489,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] + MIWaveTile: [8, 6] MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveTileB: 6 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 256 + MacroTile1: 192 MacroTileA: 256 - MacroTileB: 256 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -30774,10 +36506,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -30787,15 +36523,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -30808,13 +36545,14 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 161 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 64 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 8 StreamK: 0 @@ -30825,45 +36563,55 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 32 - ThreadTileB: 8 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 - VectorWidthB: 8 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -30875,32 +36623,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1tvg6nMEfnbvV_3zIdg6KTauFo8TF2jXSIB5PQIee6vc= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 8 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -30912,44 +36666,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33280 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 33280 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -30959,16 +36714,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -30976,10 +36732,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -30989,15 +36749,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -31010,62 +36771,73 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 162 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 64 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -31080,26 +36852,31 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 @@ -31114,44 +36891,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x352x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 40576 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 40576 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 23936 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 82176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 40576 + LdsOffsetMetadata_Blk: 82176 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -31162,15 +36940,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] + MIWaveTile: [8, 11] MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveTileB: 11 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 128 + MacroTile1: 352 MacroTileA: 256 - MacroTileB: 128 + MacroTileB: 352 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -31178,10 +36957,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -31191,15 +36974,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 352 + NumGlobalWriteVectorsPerThread: 44 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 11 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 11 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -31212,13 +36996,14 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 163 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x352x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 64 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 8 StreamK: 0 @@ -31229,45 +37014,55 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 4 + ThreadTile1: 11 ThreadTileA: 32 - ThreadTileB: 4 + ThreadTileB: 11 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -31279,11 +37074,13 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3QgxLW-e3x8LxQgNKbghO9oAbcxnIx4uS3G_GHD3Uv_o= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -31291,20 +37088,24 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 8 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -31316,8 +37117,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT12_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 32 LSCB: 32 LSPA: 32 @@ -31326,29 +37127,30 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33280 + LdsBytesNoAmax: 41984 LdsInitCVgprs: false - LdsNumBytes: 33280 - LdsNumElementsAlignedA: 16640 + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 25344 LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 25344 + LdsOffsetB_Blk: 90880 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 82176 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 90880 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -31364,15 +37166,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 + MIWaveTile: [12, 8] + MIWaveTileA: 12 MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 256 + MacroTile0: 384 MacroTile1: 256 - MacroTileA: 256 + MacroTileA: 384 MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -31380,10 +37183,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -31392,16 +37199,17 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 384 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 12 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 12 NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -31414,15 +37222,16 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 164 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT12_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 64 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -31431,79 +37240,98 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 48 ThreadTile1: 8 - ThreadTileA: 32 + ThreadTileA: 48 ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 + VectorWidthA: 4 VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2wdw6YWzv5OyKthMSpDGdGDbuM8m82xztD542UUs5j4c= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 3 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -31515,44 +37343,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30720 + LdsBytesNoAmax: 25088 LdsInitCVgprs: false - LdsNumBytes: 30720 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 5120 + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 25600 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 25088 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -31563,15 +37392,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -31579,27 +37409,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -31612,15 +37448,19 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 165 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU3_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 64 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -31628,48 +37468,59 @@ SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 8 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 3] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -31678,27 +37529,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -31711,44 +37568,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 58880 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 0 + LdsOffsetB_Blk: 32768 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 + LdsOffsetMetadata: 0 + LdsOffsetMetadata_Blk: 32768 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -31758,16 +37616,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 12] + MIWaveTileA: 6 + MIWaveTileB: 12 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 384 + MacroTile1: 192 + MacroTileA: 384 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -31775,27 +37634,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 2 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -31808,64 +37673,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 166 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 256 + StaggerUMapping: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 24 + ThreadTile1: 12 + ThreadTileA: 24 + ThreadTileB: 12 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -31874,28 +37754,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 3 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -31907,39 +37793,40 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - L1CacheSwizzle: false + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 15872 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -31954,16 +37841,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -31971,27 +37859,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -32004,50 +37898,57 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 167 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU3_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 8 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 3] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 @@ -32058,10 +37959,18 @@ _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -32070,28 +37979,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 3 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -32103,44 +38018,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30720 + LdsBytesNoAmax: 41984 LdsInitCVgprs: false - LdsNumBytes: 30720 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 5120 + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 25344 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 82176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 25600 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 82176 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -32151,15 +38067,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTile: [8, 12] + MIWaveTileA: 8 + MIWaveTileB: 12 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 256 + MacroTile1: 384 + MacroTileA: 256 + MacroTileB: 384 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -32167,27 +38084,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 384 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 12 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -32200,15 +38123,19 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 168 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU3_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerUMapping: 0 + StaggerUStride: 64 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -32216,78 +38143,96 @@ SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 12 + ThreadTileA: 32 + ThreadTileB: 12 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 8 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 3] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6Ob84Fk3g2GfeTXIjS6iLrhkXQJaWbjD_x56psl77fLo= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 64 + DebugStreamK: 0 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 4 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -32299,44 +38244,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LCS0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 15872 + LdsBytesNoAmax: 21504 LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 + LdsNumBytes: 21504 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 + LdsOffsetMetadata: 21504 + LdsOffsetMetadata_Blk: 49664 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -32346,16 +38292,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -32363,27 +38310,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -32396,25 +38349,29 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 169 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU4_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -32424,66 +38381,84 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 4] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1g9dGs1E66kAN2p7-th_lxwNSWH_Chm1_NBwWi3yJrrc= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -32495,44 +38470,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - L1CacheSwizzle: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30208 + LdsBytesNoAmax: 47616 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 47616 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 30720 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 25600 + LdsOffsetMetadata: 47616 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -32542,16 +38518,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -32559,27 +38536,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -32592,25 +38575,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 170 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW1_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 256 + StaggerUMapping: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32619,39 +38607,48 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 8 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true @@ -32659,29 +38656,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -32693,24 +38695,24 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 51200 + LdsNumBytes: 59392 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -32719,18 +38721,19 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 + LdsOffsetMetadata: 59392 LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -32741,15 +38744,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTile: [8, 5] + MIWaveTileA: 8 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -32757,10 +38761,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -32770,15 +38778,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 20 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -32791,15 +38800,16 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 171 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -32808,11 +38818,12 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32821,39 +38832,48 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 8 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true @@ -32861,28 +38881,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -32895,44 +38920,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 55296 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -32943,15 +38969,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -32959,28 +38986,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 6 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -32993,13 +39025,14 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 172 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 512 + StaggerUMapping: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 0 @@ -33010,52 +39043,62 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true @@ -33063,29 +39106,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -33097,44 +39145,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 - L1CacheSwizzle: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33792 + LdsBytesNoAmax: 25088 LdsInitCVgprs: false - LdsNumBytes: 33792 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 25088 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -33144,16 +39193,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -33161,10 +39211,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -33174,15 +39228,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -33195,28 +39250,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 173 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 64 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -33225,39 +39282,48 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 8 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true @@ -33265,29 +39331,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -33299,44 +39370,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 41984 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 25600 + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 25344 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 82176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 82176 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -33347,15 +39419,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveTile: [8, 12] + MIWaveTileA: 8 + MIWaveTileB: 12 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 + MacroTile0: 256 + MacroTile1: 384 + MacroTileA: 256 + MacroTileB: 384 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -33363,10 +39436,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -33376,15 +39453,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 + NumElementsPerThread: 384 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 12 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -33397,15 +39475,16 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 174 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerUMapping: 0 + StaggerUStride: 64 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -33414,81 +39493,97 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 + ThreadTile0: 32 + ThreadTile1: 12 + ThreadTileA: 32 + ThreadTileB: 12 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 8 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT14TrciPaSjmVFDqXIMtv8D7aAZqMZt_UCJaTXYFmC8iM= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -33501,44 +39596,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - L1CacheSwizzle: false - LSCA: 256 - LSCB: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 LSPA: 4 LSPB: 4 - LVCA: 32 - LVCB: 32 + LVCA: 16 + LVCB: 16 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 25600 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -33548,16 +39644,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] + MIWaveGroup: [1, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -33565,10 +39662,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -33580,13 +39681,14 @@ NumElementsPerBatchStore: 16 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 64 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -33599,23 +39701,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 175 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 ThreadTile1: 1 @@ -33629,6 +39733,8 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -33638,59 +39744,72 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6LiWTe0wmd30aFcIdn46EIaDdfK1x_GFEXCVb7BpSEHc= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -33703,44 +39822,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 25600 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -33750,16 +39870,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -33767,10 +39888,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -33780,15 +39905,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 5 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -33801,28 +39927,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 176 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 5 + ThreadTile1: 2 ThreadTileA: 16 - ThreadTileB: 5 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -33831,68 +39959,83 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6FkrtZW54KipGdQlngh-_UO5kc_h4rtDpa1AicW8c6NU= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -33905,44 +40048,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26624 + LdsBytesNoAmax: 18432 LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 17408 + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26624 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -33953,15 +40097,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -33969,10 +40114,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -33982,15 +40131,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -34003,13 +40153,14 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 177 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 0 @@ -34020,82 +40171,98 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 8 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT9FQSZEwhlZVFR0jJ5mvLJER_VruNLLXDUi3jwInvxP6I= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -34107,44 +40274,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 45056 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 27648 LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -34155,15 +40323,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -34171,28 +40340,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 6 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -34205,15 +40379,16 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 178 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -34222,11 +40397,12 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -34235,39 +40411,48 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true @@ -34275,29 +40460,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -34309,39 +40499,40 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - L1CacheSwizzle: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 17408 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 17408 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 57856 + LdsOffsetMetadata_Blk: 100352 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -34356,16 +40547,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -34373,10 +40565,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -34386,15 +40582,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -34407,99 +40604,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 179 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1TgHbyccBoavkzCJUCIK_Dsob-rWNEJ1rKCDRhVz4VkY= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -34511,44 +40725,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - L1CacheSwizzle: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 17408 + LdsBytesNoAmax: 23680 LdsInitCVgprs: false - LdsNumBytes: 17408 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 23680 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 15232 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 41216 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 23680 + LdsOffsetMetadata_Blk: 41216 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 4 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -34558,16 +40773,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 224 + MacroTileA: 128 + MacroTileB: 224 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -34575,10 +40791,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -34588,15 +40808,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -34609,99 +40830,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 180 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 64 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2Ad1-wA4-xippvDlvO5SK-dLb8b3YW4lpF2AaJ0hK62o= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -34713,44 +40951,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - L1CacheSwizzle: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 49152 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 2 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -34760,16 +40999,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 3] + MIWaveTileA: 8 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -34777,10 +41017,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -34790,15 +41034,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -34811,99 +41056,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 181 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 8 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2FCluDWtJTZUPq6sqYQuO1LGnZPkqLUiVWN20T3wYGHk= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -34915,44 +41177,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - L1CacheSwizzle: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 26112 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 100352 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 2 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -34962,16 +41225,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 48 - MacroTileA: 32 - MacroTileB: 48 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -34979,28 +41243,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 6 - NumLoadsA: 4 - NumLoadsB: 6 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -35013,28 +41282,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 182 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM4_WGMXCC8_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -35043,39 +41314,48 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 4 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true @@ -35083,29 +41363,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -35117,44 +41402,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - L1CacheSwizzle: false - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 65024 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 26112 + LdsNumBytes: 65024 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 0 + LdsOffsetB_Blk: 32768 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 + LdsOffsetMetadata: 0 + LdsOffsetMetadata_Blk: 32768 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 2 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -35164,16 +41450,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 48 - MacroTileA: 32 - MacroTileB: 48 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -35181,28 +41468,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 6 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 2 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -35215,93 +41507,115 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 183 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 512 + StaggerUMapping: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1NfBMJR2nhSQrXZjPVhfPaTQnJyOkdSXBAf0LKK8lIUw= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 256 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -35310,45 +41624,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB0_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 45056 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 27648 LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 2 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -35358,16 +41676,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -35375,27 +41694,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 5 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 6 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -35408,60 +41733,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 184 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB0_NTC0_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW2_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM0_WGMXCC1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 512 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -35470,28 +41814,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -35503,7 +41853,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 32 LSCB: 32 LSPA: 32 @@ -35512,29 +41863,30 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16896 + LdsBytesNoAmax: 41984 LdsInitCVgprs: false - LdsNumBytes: 16896 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 25344 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 82176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 41216 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 82176 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -35550,15 +41902,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [8, 12] + MIWaveTileA: 8 + MIWaveTileB: 12 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 384 + MacroTileA: 256 + MacroTileB: 384 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -35566,27 +41919,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 384 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 12 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -35599,25 +41958,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 185 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 64 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 32 + ThreadTile1: 12 + ThreadTileA: 32 + ThreadTileB: 12 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -35626,65 +41990,83 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 8 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 0 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2E4Wa494AQQUrc_kO5QOfwmr71dBLn-YOWfr04Hn_Zwk= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 32 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -35697,43 +42079,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25088 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25088 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 57856 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -35743,16 +42127,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 8] + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveTileB: 10 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -35760,27 +42145,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -35793,92 +42184,115 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 186 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 8 + ThreadTile1: 10 ThreadTileA: 16 - ThreadTileB: 8 + ThreadTileB: 10 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 8 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6KDLxOnXYs8I2smF7Nmh-XAbsBpQMaPIcE3B67m6uGQQ= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 32 + DebugStreamK: 0 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -35891,43 +42305,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25088 + LdsBytesNoAmax: 41984 LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25088 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -35937,16 +42353,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 8] + MIWaveGroup: [1, 1] + MIWaveTile: [4, 1] MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -35954,10 +42371,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -35965,16 +42386,18 @@ NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -35987,25 +42410,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 187 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO1_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 64 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 8 + ThreadTile1: 1 ThreadTileA: 16 - ThreadTileB: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -36014,36 +42442,47 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 8 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -36052,27 +42491,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 32 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -36085,43 +42530,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25088 + LdsBytesNoAmax: 63488 LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 46080 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25088 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -36132,15 +42579,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 8] + MIWaveTile: [4, 9] MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveTileB: 9 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 256 + MacroTile1: 288 MacroTileA: 128 - MacroTileB: 256 + MacroTileB: 288 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -36148,27 +42596,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsB: 9 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 9 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -36181,25 +42635,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 188 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x288x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 64 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 8 + ThreadTile1: 9 ThreadTileA: 16 - ThreadTileB: 8 + ThreadTileB: 9 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -36208,66 +42667,84 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 8 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2UFC01hJkw0ZMtJAhkoIrCeCqOPLY-kTrlnUV7c9j6-k= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -36279,38 +42756,40 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT14_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62848 + LdsBytesNoAmax: 65024 LdsInitCVgprs: false - LdsNumBytes: 62848 - LdsNumElementsAlignedA: 29568 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 65024 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 29568 - LdsOffsetB_Blk: 95104 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 0 + LdsOffsetB_Blk: 32768 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 62848 - LdsOffsetMetadata_Blk: 95104 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 0 + LdsOffsetMetadata_Blk: 32768 + LdsPadA: 0 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -36325,16 +42804,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [14, 4] - MIWaveTileA: 14 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 256 - MacroTileA: 224 - MacroTileB: 256 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -36342,27 +42822,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 112 - NumLoadsA: 14 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 2 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -36375,63 +42861,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 189 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT14_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 56 - ThreadTile1: 4 - ThreadTileA: 56 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -36440,28 +42942,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -36473,43 +42981,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_TLDS1_ULSGRO0_USL0_UIOFGRO1_USFGROn1_VSn1_VWA4_VWB2_WSGRA2_WSGRB0_WS64_WG32_8_1 - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -36520,15 +43030,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -36536,27 +43047,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -36569,93 +43086,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 190 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW4_TLDS1_ULSGRO0_USL0_UIOFGRO1_USFGROn1_VSn1_VWA4_VWB2_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM304_WGMXCC32_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true - Use64bShadowLimit: 0 - UseInstOffsetForGRO: 1 + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 2 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 304 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 0 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2bwD4-IsJyJfSKI9R0SMF8pC4lbDzr-3YFMP6inWAM8I= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -36667,43 +43207,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -36714,15 +43256,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -36730,27 +43273,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 8 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -36763,63 +43312,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 191 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC32_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 8 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 0 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -36828,28 +43393,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -36861,43 +43432,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 65024 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 65024 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 0 + LdsOffsetB_Blk: 32768 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 + LdsOffsetMetadata: 0 + LdsOffsetMetadata_Blk: 32768 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -36907,16 +43480,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [5, 14] + MIWaveTileA: 5 + MIWaveTileB: 14 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 320 + MacroTile1: 224 + MacroTileA: 320 + MacroTileB: 224 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -36924,27 +43498,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NonTemporalWS: 0 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 280 + NumGlobalWriteVectorsPerThread: 280 + NumLoadsA: 10 + NumLoadsB: 7 + NumLoadsCoalescedA: 2 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -36957,25 +43537,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 192 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 1 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 20 + ThreadTile1: 14 + ThreadTileA: 20 + ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -36984,36 +43569,47 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -37022,27 +43618,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 128 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -37055,43 +43657,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51712 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 51712 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51712 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -37101,16 +43705,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -37118,10 +43723,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -37129,16 +43738,18 @@ NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -37151,63 +43762,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 193 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCG304 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 304 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -37216,28 +43843,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 2 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -37249,7 +43882,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -37258,29 +43892,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -37296,15 +43931,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -37312,27 +43948,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -37345,25 +43987,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 194 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU2_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC8_WGMXCCG32 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -37372,36 +44019,47 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 8 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 32 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 2] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -37410,28 +44068,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -37443,38 +44107,40 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA2_WSGRB2_WS64_WG16_16_1 - LSCA: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35328 + LdsBytesNoAmax: 58880 LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 0 + LdsOffsetB_Blk: 32768 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 + LdsOffsetMetadata: 0 + LdsOffsetMetadata_Blk: 32768 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -37489,16 +44155,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 12] + MIWaveTileA: 6 + MIWaveTileB: 12 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 384 + MacroTile1: 192 + MacroTileA: 384 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -37506,27 +44173,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 12 + NumLoadsB: 6 + NumLoadsCoalescedA: 2 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -37539,63 +44212,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 195 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA2_WSGRB2_WS64_WG16_16_1_WGM4_WGMXCC4_WGMXCCG32 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 24 + ThreadTile1: 12 + ThreadTileA: 24 + ThreadTileB: 12 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 32 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -37604,28 +44293,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -37637,38 +44332,40 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_TLDS1_ULSGRO1_USL0_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA2_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 4 + LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 1 + LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 60928 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 60928 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 91648 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -37683,16 +44380,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 4] + MIWaveTileA: 12 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -37700,27 +44398,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -37733,24 +44437,29 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 196 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW8_TLDS1_ULSGRO1_USL0_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 - StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 48 ThreadTile1: 4 - ThreadTileA: 32 + ThreadTileA: 48 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -37759,37 +44468,48 @@ UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true - Use64bShadowLimit: 0 + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 + VectorWidthA: 4 VectorWidthB: 4 - WaveSeparateGlobalReadA: 2 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -37798,28 +44518,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -37831,38 +44557,40 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 LSCB: 64 - LSPA: 32 + LSPA: 64 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 65024 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 65024 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetB: 0 + LdsOffsetB_Blk: 32768 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 + LdsOffsetMetadata: 0 + LdsOffsetMetadata_Blk: 32768 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -37877,16 +44605,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 + MIWaveGroup: [4, 1] + MIWaveTile: [5, 14] + MIWaveTileA: 5 + MIWaveTileB: 14 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + MacroTile0: 320 + MacroTile1: 224 + MacroTileA: 320 + MacroTileB: 224 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -37894,10 +44623,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -37905,16 +44638,18 @@ NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 + NonTemporalWS: 0 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 280 + NumGlobalWriteVectorsPerThread: 280 + NumLoadsA: 10 + NumLoadsB: 7 + NumLoadsCoalescedA: 2 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -37927,25 +44662,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 197 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM304_WGMXCC4_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 + ThreadTile0: 20 + ThreadTile1: 14 + ThreadTileA: 20 + ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -37954,66 +44694,84 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 304 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1FGf7IE7mM6Q7tTV8V4ozqUy5kMZ4Ggk1B8Ghs8TATVM= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -38025,7 +44783,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -38034,29 +44793,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 55296 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -38072,15 +44832,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -38088,10 +44849,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -38099,16 +44864,18 @@ NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 6 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -38121,25 +44888,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 198 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC4_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -38148,66 +44920,84 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 0 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1iM5xMfiYM_YfrHylwPKVj3EBmFKlGoiFtAQ1oJRMzCI= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 32 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -38219,43 +45009,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 27520 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 27520 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 10880 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 49408 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27520 - LdsOffsetMetadata_Blk: 49408 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 57856 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -38265,16 +45057,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -38282,10 +45075,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -38293,16 +45090,18 @@ NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -38315,25 +45114,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 199 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM304_WGMXCC1_WGMXCCG4 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 64 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -38342,36 +45146,47 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 304 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -38380,28 +45195,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 32 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -38413,43 +45234,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23680 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 23680 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 15232 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23680 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -38460,15 +45283,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 7] - MIWaveTileA: 4 - MIWaveTileB: 7 + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 224 - MacroTileA: 128 - MacroTileB: 224 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -38476,27 +45300,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 112 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 4 - NumLoadsB: 7 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 7 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -38509,93 +45339,116 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 200 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM38_WGMXCC4_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 64 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 7 - ThreadTileA: 16 - ThreadTileB: 7 + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 38 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 0 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1xvYLVPs-v0Z88DWh0nLMeCHnm-r8I6jVSD5Yikxh94c= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 32 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -38607,43 +45460,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23680 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 23680 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 15232 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23680 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -38654,15 +45509,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 7] - MIWaveTileA: 4 - MIWaveTileB: 7 + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 224 - MacroTileA: 128 - MacroTileB: 224 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -38670,27 +45526,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 112 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 4 - NumLoadsB: 7 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 7 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -38703,63 +45565,79 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 201 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM38_WGMXCC4_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 7 - ThreadTileA: 16 - ThreadTileB: 7 + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 38 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 0 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -38768,27 +45646,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DepthU: 32 + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -38801,43 +45685,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCB: 64 + LSPA: 64 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23680 + LdsBytesNoAmax: 65024 LdsInitCVgprs: false - LdsNumBytes: 23680 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 15232 + LdsNumBytes: 65024 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 + LdsOffsetB: 0 + LdsOffsetB_Blk: 32768 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23680 - LdsOffsetMetadata_Blk: 41216 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 0 + LdsOffsetMetadata_Blk: 32768 + LdsPadA: 0 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -38847,16 +45733,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 7] + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] MIWaveTileA: 4 - MIWaveTileB: 7 + MIWaveTileB: 14 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 256 MacroTile1: 224 - MacroTileA: 128 + MacroTileA: 256 MacroTileB: 224 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -38864,10 +45751,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -38875,16 +45766,18 @@ NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 112 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 4 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 NumLoadsB: 7 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 2 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -38897,59 +45790,73 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 202 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM38_WGMXCC4_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 7 + ThreadTile1: 14 ThreadTileA: 16 - ThreadTileB: 7 + ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 38 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -38964,8 +45871,9 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -38973,17 +45881,21 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 9 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 @@ -38994,42 +45906,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA2_WSGRB2_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 1 - LVPB: 1 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 60928 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 60928 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 91648 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -39044,16 +45958,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [12, 4] + MIWaveTileA: 12 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -39061,10 +45976,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -39073,16 +45992,17 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -39095,63 +46015,74 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 203 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU9_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA2_WSGRB2_WS64_WG32_8_1_WGM1_WGMXCC304_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 48 ThreadTile1: 4 - ThreadTileA: 16 + ThreadTileA: 48 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 VectorWidthB: 4 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 304 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 9] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -39165,29 +46096,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -39195,47 +46131,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 58880 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 58880 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 0 + LdsOffsetB_Blk: 32768 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 + LdsOffsetMetadata: 0 + LdsOffsetMetadata_Blk: 32768 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -39245,16 +46183,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [5, 12] + MIWaveTileA: 5 + MIWaveTileB: 12 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 320 + MacroTile1: 192 + MacroTileA: 320 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -39262,10 +46201,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -39274,16 +46217,17 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 240 + NumLoadsA: 10 + NumLoadsB: 6 + NumLoadsCoalescedA: 2 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -39296,28 +46240,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 204 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x192x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 20 + ThreadTile1: 12 + ThreadTileA: 20 + ThreadTileB: 12 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -39326,32 +46272,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -39363,31 +46318,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT12_t3NdILnqpQ6T-BmAMhwZMUme66jj1UBjEiaMhlrqQ= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -39396,47 +46357,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA2_WSGRB2_WS64_WG64_4_1 - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x160x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51712 + LdsBytesNoAmax: 23936 LdsInitCVgprs: false - LdsNumBytes: 51712 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 23936 + LdsNumElementsAlignedA: 13056 + LdsNumElementsAlignedB: 10880 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13056 + LdsOffsetB_Blk: 45824 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51712 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 23936 + LdsOffsetMetadata_Blk: 45824 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -39446,16 +46409,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 5] + MIWaveTileA: 6 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 192 + MacroTile1: 160 + MacroTileA: 192 + MacroTileB: 160 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -39463,10 +46427,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -39475,16 +46443,17 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 120 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 6 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -39497,62 +46466,73 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 205 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA2_WSGRB2_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCG32 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x160x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 64 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 24 + ThreadTile1: 5 + ThreadTileA: 24 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 32 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -39564,32 +46544,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT10JuJieIZAw4u1aR8arami2gX8JEkrU9LM45EsKB036s= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 2 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -39597,47 +46583,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA2_WSGRB2_WS64_WG16_16_1 - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB2048_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA4_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35328 + LdsBytesNoAmax: 33280 LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 33280 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 0 + LdsOffsetB_Blk: 65536 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 65536 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -39647,16 +46635,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 MIWaveTileMetadata: 0 MacroTile0: 128 MacroTile1: 128 MacroTileA: 128 MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -39664,28 +46653,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 4 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -39698,62 +46692,73 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 206 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU2_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA2_WSGRB2_WS64_WG16_16_1_WGM0_WGMXCC32_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB2048_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA4_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 - StoreVectorWidth: 8 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 2] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -39765,11 +46770,13 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2m1Hm_EUOA4cGHFtFFS72D3vN7_FapmeRjgxj-OExMN0= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -39777,20 +46784,24 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -39798,42 +46809,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT208x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT13_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS13_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -39848,16 +46861,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [13, 2] + MIWaveTileA: 13 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 208 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 208 MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -39865,28 +46879,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumElementsPerBatchStore: 13 + NumElementsPerThread: 104 + NumGlobalWriteVectorsPerThread: 104 + NumLoadsA: 13 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 13 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -39899,62 +46918,73 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 207 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO4_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC32_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT208x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT13_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS13_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 52 + ThreadTile1: 2 + ThreadTileA: 52 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -39966,32 +46996,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1xlLNvCacS6nbP0_ceDkkIue6XfsbQS7gqLwsSOXe5Xo= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -39999,47 +47035,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA2_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 1 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 25216 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 25216 + LdsNumElementsAlignedA: 8320 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8320 + LdsOffsetB_Blk: 41088 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 25216 + LdsOffsetMetadata_Blk: 41088 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -40049,16 +47087,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] + MIWaveTileA: 8 MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 256 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -40066,10 +47105,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -40078,16 +47121,17 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -40100,27 +47144,29 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 208 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 4 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 0 + StaggerUStride: 64 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 4 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -40130,32 +47176,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 8 VectorWidthB: 4 - WaveSeparateGlobalReadA: 2 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 3 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -40170,28 +47225,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -40200,47 +47260,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA2_WSGRB2_WS64_WG16_16_1 - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x352x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51712 + LdsBytesNoAmax: 40576 LdsInitCVgprs: false - LdsNumBytes: 51712 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 40576 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 23936 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 82176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51712 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 40576 + LdsOffsetMetadata_Blk: 82176 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -40250,16 +47312,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 4] + MIWaveGroup: [2, 2] + MIWaveTile: [8, 11] MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveTileB: 11 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 256 + MacroTile1: 352 + MacroTileA: 256 + MacroTileB: 352 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -40267,10 +47330,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -40279,16 +47346,17 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 352 + NumGlobalWriteVectorsPerThread: 44 + NumLoadsA: 8 + NumLoadsB: 11 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 11 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -40301,28 +47369,30 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 209 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA2_WSGRB2_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x352x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 64 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 4 + ThreadTile1: 11 ThreadTileA: 32 - ThreadTileB: 4 + ThreadTileB: 11 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -40331,34 +47401,43 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 1 - ActivationAlt: false + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 @@ -40371,8 +47450,9 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -40380,19 +47460,23 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -40401,18 +47485,19 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA2_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 4 + LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 1 + LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 @@ -40437,6 +47522,7 @@ LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -40461,6 +47547,7 @@ MacroTileA: 192 MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -40468,10 +47555,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -40480,7 +47571,7 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 12 NumElementsPerThread: 192 NumGlobalWriteVectorsPerThread: 48 NumLoadsA: 6 @@ -40490,6 +47581,7 @@ NumLoadsPerpendicularA: 6 NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -40502,13 +47594,14 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 210 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA2_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 0 @@ -40519,6 +47612,7 @@ SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 48 ThreadTile1: 4 @@ -40527,37 +47621,46 @@ TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 VectorWidthB: 4 - WaveSeparateGlobalReadA: 2 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -40572,8 +47675,9 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -40581,19 +47685,23 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -40602,19 +47710,20 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB2_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 - LSPB: 4 + LSPB: 32 LVCA: 8 LVCB: 8 LVPA: 4 - LVPB: 1 + LVPB: 4 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 @@ -40638,6 +47747,7 @@ LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -40662,6 +47772,7 @@ MacroTileA: 192 MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -40669,10 +47780,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -40681,7 +47796,7 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 12 NumElementsPerThread: 192 NumGlobalWriteVectorsPerThread: 48 NumLoadsA: 6 @@ -40691,6 +47806,7 @@ NumLoadsPerpendicularA: 6 NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -40703,13 +47819,14 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 211 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB2_WS64_WG16_16_1_WGM16_WGMXCC1_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 0 @@ -40720,6 +47837,7 @@ SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 48 ThreadTile1: 4 @@ -40733,6 +47851,8 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true @@ -40740,25 +47860,32 @@ VectorWidthA: 4 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 0 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -40773,8 +47900,9 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -40782,19 +47910,23 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -40803,27 +47935,28 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA2_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x352x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 4 + LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 1 + LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33280 + LdsBytesNoAmax: 40576 LdsInitCVgprs: false - LdsNumBytes: 33280 + LdsNumBytes: 40576 LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedB: 23936 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -40832,13 +47965,14 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 + LdsOffsetMetadata: 40576 LdsOffsetMetadata_Blk: 82176 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -40854,15 +47988,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] + MIWaveTile: [8, 11] MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveTileB: 11 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 256 + MacroTile1: 352 MacroTileA: 256 - MacroTileB: 256 + MacroTileB: 352 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -40870,28 +48005,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 352 + NumGlobalWriteVectorsPerThread: 44 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 11 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 11 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -40904,14 +48044,15 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 212 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO0_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC32_WGMXCCG0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x352x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 64 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 @@ -40921,11 +48062,12 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 8 + ThreadTile1: 11 ThreadTileA: 32 - ThreadTileB: 8 + ThreadTileB: 11 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -40934,32 +48076,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: 0 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -40974,8 +48125,9 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -40983,220 +48135,21 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60928 - LdsInitCVgprs: false - LdsNumBytes: 60928 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 60928 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 4] - MIWaveTileA: 12 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 213 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC1_WGMXCCG0 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 3 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 @@ -41207,42 +48160,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB2_WS64_WG64_4_1 - LSCA: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x240x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 LSCB: 64 - LSPA: 32 - LSPB: 4 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 48640 + LdsBytesNoAmax: 38400 LdsInitCVgprs: false - LdsNumBytes: 48640 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 38400 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 38400 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 0 + LdsOffsetB_Blk: 65536 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 48640 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 + LdsOffsetMetadata: 38400 + LdsOffsetMetadata_Blk: 65536 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -41258,15 +48213,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [4, 6] + MIWaveTile: [4, 15] MIWaveTileA: 4 - MIWaveTileB: 6 + MIWaveTileB: 15 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 96 + MacroTile1: 240 MacroTileA: 256 - MacroTileB: 96 + MacroTileB: 240 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -41274,28 +48230,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 60 NumLoadsA: 8 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 + NumLoadsB: 15 + NumLoadsCoalescedA: 2 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 15 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -41307,14 +48268,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 214 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU3_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB2_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCG32 + SolutionIndex: 213 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x240x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 0 @@ -41325,45 +48287,55 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 6 + ThreadTile1: 15 ThreadTileA: 16 - ThreadTileB: 6 + ThreadTileB: 15 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 32 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 3] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -41378,30 +48350,33 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -41410,27 +48385,28 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB2_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA2_WSGRB2_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 1 - LVPB: 1 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 53760 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 53760 + LdsNumBytes: 51712 LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 36864 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -41439,13 +48415,14 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 53760 + LdsOffsetMetadata: 51712 LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -41470,6 +48447,7 @@ MacroTileA: 128 MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -41477,19 +48455,23 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 6 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 16 NumElementsPerThread: 128 NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 @@ -41499,6 +48481,7 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -41510,14 +48493,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 215 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA1_NTB2_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA2_WSGRB2_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCG0 + SolutionIndex: 214 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 8 StreamK: 0 @@ -41528,6 +48512,7 @@ SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 ThreadTile1: 4 @@ -41541,32 +48526,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 0 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -41581,31 +48575,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -41613,11 +48610,12 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -41626,29 +48624,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60416 + LdsBytesNoAmax: 55296 LdsInitCVgprs: false - LdsNumBytes: 60416 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 25600 + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 60416 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -41664,15 +48663,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -41680,10 +48680,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -41692,16 +48696,17 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 8 - NumLoadsB: 5 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 6 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -41713,16 +48718,17 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 216 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -41731,45 +48737,55 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -41781,33 +48797,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1P6L0qL2d6KDvXOyAkWH1pyaGYFFpMvS0Kdne04wyJZI= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -41816,47 +48836,49 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA2_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x112x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalReadVectorWidth: 8 + LocalSplitU: 4 + LocalSplitUReuseLDS: 4 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -41866,16 +48888,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 1] MIWaveTile: [8, 7] MIWaveTileA: 8 MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 + MacroTile0: 128 + MacroTile1: 112 + MacroTileA: 128 + MacroTileB: 112 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -41883,10 +48906,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -41896,15 +48923,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 14 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 7 + NumLoadsA: 8 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -41916,24 +48944,26 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 217 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x112x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 ThreadTile1: 7 @@ -41947,32 +48977,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -41984,34 +49023,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1slht0qOsu_JBHGcLlNDV44Oh_QlgFp15qBBQACJo_JY= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -42019,42 +49062,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 + LdsBytesNoAmax: 45056 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -42070,15 +49115,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -42086,10 +49132,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -42099,15 +49149,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 14 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -42119,16 +49170,17 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 218 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC4_WGMXCCGn1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -42137,11 +49189,12 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -42150,71 +49203,84 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 4 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 8 - AssertSummationElementMultiple: 32 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT28uEbC94xo9q_s0-Yz_HGWXZ3U4LN2ihaG7obRknrmDU= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false - GlobalReadPerMfma: 0.5 + GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -42226,7 +49292,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p50_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -42235,29 +49302,30 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 63488 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 35840 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 101376 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -42273,15 +49341,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveTile: [7, 6] + MIWaveTileA: 7 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 224 + MacroTile1: 192 + MacroTileA: 224 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -42289,28 +49358,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 168 + NumGlobalWriteVectorsPerThread: 168 + NumLoadsA: 7 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -42322,16 +49396,17 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 219 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p50_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM40_WGMXCC1_WGMXCCGn1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -42340,11 +49415,12 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 28 + ThreadTile1: 6 + ThreadTileA: 28 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -42353,32 +49429,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 40 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -42393,31 +49478,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -42429,39 +49517,41 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPM0p50_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA2_WSGRB2_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 4 - LSPB: 4 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 128 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 21504 + LdsBytesNoAmax: 25216 LdsInitCVgprs: false - LdsNumBytes: 21504 - LdsNumElementsAlignedA: 13056 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 25216 + LdsNumElementsAlignedA: 8320 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 13056 - LdsOffsetB_Blk: 45824 + LdsOffsetB: 8320 + LdsOffsetB_Blk: 41088 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 21504 - LdsOffsetMetadata_Blk: 45824 + LdsOffsetMetadata: 25216 + LdsOffsetMetadata_Blk: 41088 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.5 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 @@ -42475,16 +49565,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 4] - MIWaveTileA: 6 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] + MIWaveTileA: 8 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -42492,28 +49583,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -42525,63 +49621,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 220 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPM0p50_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA2_WSGRB2_WS64_WG32_8_1_WGM304_WGMXCC8_WGMXCCG0 + SolutionIndex: 219 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 64 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 + ThreadTile0: 32 ThreadTile1: 4 - ThreadTileA: 24 + ThreadTileA: 32 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 8 VectorWidthB: 4 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 304 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -42593,34 +49700,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1ZFGF8Zw_mPOLvGb0xG-UZkAHBcvpAxLuHSVLXfeaOm0= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -42632,38 +49743,40 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA2_WSGRB2_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 4 - LSPB: 4 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 128 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 21504 + LdsBytesNoAmax: 25216 LdsInitCVgprs: false - LdsNumBytes: 21504 - LdsNumElementsAlignedA: 13056 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 25216 + LdsNumElementsAlignedA: 8320 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 13056 - LdsOffsetB_Blk: 45824 + LdsOffsetB: 8320 + LdsOffsetB_Blk: 41088 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 21504 - LdsOffsetMetadata_Blk: 45824 + LdsOffsetMetadata: 25216 + LdsOffsetMetadata_Blk: 41088 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -42678,16 +49791,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 4] - MIWaveTileA: 6 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] + MIWaveTileA: 8 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -42695,28 +49809,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -42728,28 +49847,30 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 221 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA2_WSGRB2_WS64_WG32_8_1_WGM304_WGMXCC8_WGMXCCG0 + SolutionIndex: 220 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 64 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 + ThreadTile0: 32 ThreadTile1: 4 - ThreadTileA: 24 + ThreadTileA: 32 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -42759,32 +49880,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 8 VectorWidthB: 4 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 304 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -42796,34 +49926,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1pWdz_BwOcx7OIr1Sb-ym0sXtoR_VErd8Ns5lARLi9gg= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false - GlobalReadPerMfma: 0.5 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 8 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -42835,43 +49969,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p50_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29696 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 29696 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 13056 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 49408 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 29696 - LdsOffsetMetadata_Blk: 49408 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 56832 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -42881,16 +50017,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -42898,10 +50035,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -42911,15 +50052,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 10 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -42931,63 +50073,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 222 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p50_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCG304 + SolutionIndex: 221 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -43002,31 +50155,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -43038,39 +50194,40 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -43085,16 +50242,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -43102,28 +50260,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -43135,29 +50298,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 223 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM9_WGMXCC8_WGMXCCG304 + SolutionIndex: 222 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -43166,32 +50331,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 9 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -43206,31 +50380,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -43242,39 +50419,40 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 - L1CacheSwizzle: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 2 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -43289,16 +50467,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -43306,28 +50485,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -43339,63 +50523,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 224 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM9_WGMXCC8_WGMXCCG304 + SolutionIndex: 223 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 256 + StaggerUMapping: 0 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 9 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -43410,31 +50605,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -43446,39 +50644,40 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -43493,16 +50692,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -43510,28 +50710,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -43543,63 +50748,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 225 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM9_WGMXCC8_WGMXCCG304 + SolutionIndex: 224 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 9 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -43614,31 +50830,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -43650,44 +50869,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -43697,16 +50917,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -43714,21 +50935,25 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -43736,6 +50961,7 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -43747,63 +50973,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 226 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCG304 + SolutionIndex: 225 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 256 + StaggerUMapping: 0 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -43818,31 +51055,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -43854,39 +51094,40 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51712 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 51712 - LdsNumElementsAlignedA: 34816 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51712 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -43901,16 +51142,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -43918,28 +51160,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -43951,29 +51198,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 227 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM9_WGMXCC8_WGMXCCG304 + SolutionIndex: 226 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 256 + StaggerUMapping: 0 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -43982,32 +51231,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 9 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -44022,31 +51280,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -44058,44 +51319,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25600 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 59392 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -44105,16 +51367,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -44122,28 +51385,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -44155,66 +51423,77 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 228 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCG304 + SolutionIndex: 227 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 256 + StaggerUMapping: 0 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 @@ -44226,31 +51505,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -44262,44 +51544,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 - L1CacheSwizzle: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 2 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -44309,16 +51592,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -44326,28 +51610,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -44359,63 +51648,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 229 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCG304 + SolutionIndex: 228 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -44430,12 +51730,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -44443,18 +51744,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -44466,44 +51769,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT20_4_MO40_NTn1_NTA0_NTB0_NTC3_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 38016 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 38016 - LdsNumElementsAlignedA: 21120 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 21120 - LdsOffsetB_Blk: 86656 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 38016 - LdsOffsetMetadata_Blk: 86656 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalReadVectorWidth: 8 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -44513,16 +51817,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [20, 4] - MIWaveTileA: 20 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 256 - MacroTileA: 320 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -44530,28 +51835,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 6 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 320 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 10 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -44563,29 +51873,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 230 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT20_4_MO40_NTn1_NTA0_NTB0_NTC3_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC1_WGMXCCG0 + SolutionIndex: 229 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 4 - ThreadTileA: 80 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -44594,33 +51906,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 0 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - tailLoopOpt: true + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -44635,12 +51955,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -44648,18 +51969,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -44671,23 +51994,24 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA3_NTB0_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB2_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 4 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 61440 + LdsNumBytes: 65536 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -44696,14 +52020,14 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 + LdsOffsetMetadata: 56832 LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -44718,16 +52042,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -44735,28 +52060,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 7 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 10 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -44768,29 +52098,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 231 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA3_NTB0_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB2_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -44799,33 +52131,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: false + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -44840,12 +52180,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -44853,16 +52194,18 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false @@ -44876,44 +52219,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT12_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA2_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 8 - LVCB: 8 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 1 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41984 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 41984 - LdsNumElementsAlignedA: 25344 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 25344 - LdsOffsetB_Blk: 90880 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41984 - LdsOffsetMetadata_Blk: 90880 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalReadVectorWidth: 8 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -44923,16 +52267,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [12, 8] - MIWaveTileA: 12 - MIWaveTileB: 8 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 384 - MacroTile1: 256 - MacroTileA: 384 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -44940,28 +52285,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 384 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 12 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 12 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -44973,29 +52323,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 232 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT12_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 8 - ThreadTileA: 48 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -45004,33 +52356,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -45042,15 +52402,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1oTDkQQypLheBUT2qs5SDQPh8gWFwRaGrul5it-MXNdc= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -45058,18 +52420,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -45081,44 +52445,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA1_NTB0_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -45128,16 +52493,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -45145,28 +52511,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 6 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -45178,64 +52549,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 233 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA1_NTB0_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -45250,12 +52631,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -45263,18 +52645,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 8 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -45286,38 +52670,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA2_WSGRB2_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 8 - LVCB: 8 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 59392 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -45333,16 +52718,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -45350,21 +52736,25 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 3 NumLoadsA: 8 NumLoadsB: 6 NumLoadsCoalescedA: 1 @@ -45372,6 +52762,7 @@ NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -45383,64 +52774,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 234 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA2_WSGRB2_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: false + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -45455,12 +52856,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -45468,18 +52870,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -45491,38 +52895,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA2_WSGRB2_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 8 - LVCB: 8 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -45538,16 +52943,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -45555,10 +52961,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -45567,16 +52977,17 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -45588,29 +52999,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 235 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA2_WSGRB2_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -45619,33 +53032,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 + VectorWidthA: 4 VectorWidthB: 2 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -45657,15 +53078,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1-Wvm7B_KLJ-7eE0keIXZ9uaizSPgxXNe3SxiLI9ZqsY= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -45673,17 +53096,19 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -45696,44 +53121,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x448x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT12_7_MO40_NTn1_NTA0_NTB0_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA2_WSGRB2_WS64_WG16_16_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43136 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 43136 - LdsNumElementsAlignedA: 12672 - LdsNumElementsAlignedB: 30464 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 12672 - LdsOffsetB_Blk: 78208 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43136 - LdsOffsetMetadata_Blk: 78208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -45743,16 +53169,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 7] - MIWaveTileA: 12 - MIWaveTileB: 7 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 448 - MacroTileA: 192 - MacroTileB: 448 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -45760,28 +53187,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 7 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 336 - NumGlobalWriteVectorsPerThread: 84 - NumLoadsA: 6 - NumLoadsB: 14 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -45793,29 +53225,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 236 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x448x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT12_7_MO40_NTn1_NTA0_NTB0_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA2_WSGRB2_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCG304 + SolutionIndex: 235 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 64 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 7 - ThreadTileA: 48 - ThreadTileB: 7 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -45824,37 +53258,45 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -45862,15 +53304,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1zZn7tcTdDjNv58vlUkTdkM_bB853dBTI9hs8Y1z0J9U= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -45878,18 +53322,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -45901,38 +53347,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -45948,16 +53395,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 16 + MacroTile1: 48 + MacroTileA: 16 + MacroTileB: 48 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -45965,28 +53413,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 3 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 2 NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -45998,29 +53451,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 237 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -46029,33 +53484,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -46070,9 +53533,10 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 @@ -46083,18 +53547,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -46106,33 +53572,34 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC3_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA2_WSGRB2_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -46154,15 +53621,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 192 + MacroTile1: 96 + MacroTileA: 192 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -46170,28 +53638,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 6 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 3 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -46203,16 +53676,17 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 238 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC3_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA2_WSGRB2_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCG304 + SolutionIndex: 237 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 - StoreVectorWidth: 8 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -46221,11 +53695,12 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 24 + ThreadTile1: 3 + ThreadTileA: 24 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -46234,33 +53709,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: false + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -46272,15 +53755,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1uZ5oXWgBl2h_92G_wYsTJshu3OR-Vkx_aNI8kFOkdTU= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -46288,17 +53773,19 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -46311,44 +53798,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB2_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 LSPA: 32 - LSPB: 4 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 1 - LdsBlockSizePerPadA: 512 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41984 + LdsBytesNoAmax: 35328 LdsInitCVgprs: false - LdsNumBytes: 41984 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 25344 + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41984 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -46358,16 +53846,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 12] + MIWaveGroup: [1, 4] + MIWaveTile: [8, 2] MIWaveTileA: 8 - MIWaveTileB: 12 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 384 - MacroTileA: 256 - MacroTileB: 384 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -46375,28 +53864,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 6 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 384 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 8 - NumLoadsB: 12 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -46408,103 +53902,116 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 239 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC1_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS64_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB2_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCG304 + SolutionIndex: 238 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 64 - StorePriorityOpt: 1 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 12 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 12 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: 0 - DirectToVgprB: false + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -46516,44 +54023,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x288x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 36224 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 36224 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 19584 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 82176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 36224 + LdsOffsetMetadata_Blk: 82176 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -46564,15 +54072,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveTile: [8, 9] + MIWaveTileA: 8 + MIWaveTileB: 9 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -46580,29 +54089,34 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 8 + NumLoadsB: 9 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 NumThreads: 256 - OptNoLoadLoop: 0 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] @@ -46613,16 +54127,17 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 240 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x288x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 256 + StaggerUMapping: 0 + StaggerUStride: 64 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -46631,85 +54146,97 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 32 + ThreadTile1: 9 + ThreadTileA: 32 + ThreadTileB: 9 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthA: 8 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: true + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: 0 - DirectToVgprB: false + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -46721,33 +54248,34 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 - LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 + LdsInitCVgprs: false + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -46757,8 +54285,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -46768,16 +54296,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -46785,29 +54314,34 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 NumThreads: 256 - OptNoLoadLoop: 0 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] @@ -46818,64 +54352,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 241 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -46890,31 +54434,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: 0 - DirectToVgprB: false + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -46926,38 +54473,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA1_NTB2_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -46973,16 +54521,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -46990,29 +54539,34 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 - OptNoLoadLoop: 2 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] @@ -47023,64 +54577,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 242 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA1_NTB2_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -47095,31 +54659,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: 0 - DirectToVgprB: false + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false - GlobalReadPerMfma: 0.27 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -47131,40 +54698,41 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p27_GRVWA2_GRVWB2_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPM0p40_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB2_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 + LdsBytesNoAmax: 34304 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalReadVectorWidth: 8 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 4 @@ -47178,16 +54746,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -47195,28 +54764,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 32 - NumLoadsB: 28 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 28 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -47228,64 +54802,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 243 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p27_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPM0p40_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB2_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCG4 + SolutionIndex: 242 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 - StoreVectorWidth: 8 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -47300,29 +54884,32 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: 0 - DirectToVgprB: false + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false @@ -47336,38 +54923,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -47383,16 +54971,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -47400,29 +54989,34 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 - OptNoLoadLoop: 2 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] @@ -47433,29 +55027,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 244 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM190_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 243 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -47464,33 +55060,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 190 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 3 - tailLoopOpt: true + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -47505,31 +55109,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: 0 - DirectToVgprB: false + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 8 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -47541,38 +55148,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA3_NTB1_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB2_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 4 - LVCA: 8 - LVCB: 8 - LVPA: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 LVPB: 1 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -47588,16 +55196,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -47605,29 +55214,34 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 4 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 6 NumThreads: 256 - OptNoLoadLoop: 2 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] @@ -47638,64 +55252,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 245 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA3_NTB1_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM2_SUS1024_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB2_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 2 - StaggerUStride: 1024 - StorePriorityOpt: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 3 - tailLoopOpt: false + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -47710,31 +55334,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: 0 - DirectToVgprB: false + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -47746,38 +55373,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 34304 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -47793,16 +55421,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -47810,28 +55439,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -47843,64 +55477,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 246 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC1_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU4_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 4 - StaggerUMapping: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 1 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 6 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: true + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -47912,12 +55556,14 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2bFhVBNu0V0U_NjDudmvQExRXbPXVn2IGCTrajNcbha8= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 @@ -47925,20 +55571,22 @@ DirectToLdsA: false DirectToLdsB: false DirectToVgprA: 0 - DirectToVgprB: false + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -47951,40 +55599,41 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPM0p40_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB1_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 2 - LVCA: 32 - LVCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 4 - LVPB: 1 + LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalSplitUReuseLDS: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 4 @@ -47999,15 +55648,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] + MIWaveTile: [8, 4] MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 224 + MacroTile1: 128 MacroTileA: 256 - MacroTileB: 224 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -48015,10 +55665,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -48027,16 +55681,17 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 32 - NumLoadsB: 28 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 28 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -48048,15 +55703,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 247 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC1_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPM0p40_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB1_WS64_WG32_8_1_WGM12_WGMXCC4_WGMXCCG0 + SolutionIndex: 246 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 1 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 @@ -48066,46 +55722,55 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 7 + ThreadTile1: 4 ThreadTileA: 32 - ThreadTileB: 7 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 12 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 0 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -48120,9 +55785,10 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 @@ -48130,21 +55796,23 @@ DirectToLdsA: false DirectToLdsB: false DirectToVgprA: 0 - DirectToVgprB: false + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false - GlobalReadPerMfma: 0.3 - GlobalReadVectorWidthA: 2 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 8 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -48156,37 +55824,38 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p30_GRVWA2_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO3_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 8 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65280 + LdsBytesNoAmax: 55296 LdsInitCVgprs: false - LdsNumBytes: 65280 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 32256 + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 65280 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 8 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 @@ -48204,15 +55873,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -48220,29 +55890,34 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 32 - NumLoadsB: 7 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 6 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 7 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 NumThreads: 256 - OptNoLoadLoop: 0 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] @@ -48253,16 +55928,17 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 248 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p30_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO3_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 3 - StoreVectorWidth: 8 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -48271,11 +55947,12 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -48284,33 +55961,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -48325,31 +56010,34 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: 0 - DirectToVgprB: false + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false - GlobalReadPerMfma: 0.26 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 8 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -48361,44 +56049,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA2_GRVWB2_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -48409,15 +56098,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -48425,29 +56115,34 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 32 - NumLoadsB: 28 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 28 - NumThreads: 256 - OptNoLoadLoop: 0 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] @@ -48458,16 +56153,17 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 249 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM2_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 2 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -48476,11 +56172,12 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -48489,33 +56186,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 2 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: true + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -48527,12 +56232,14 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1EfssSWzlMoW1jWxgMIrFc4c0Gx2X3hCeVvGnf-bvK1U= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 @@ -48540,19 +56247,21 @@ DirectToLdsA: false DirectToLdsB: false DirectToVgprA: 0 - DirectToVgprB: false + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 GroupLoadStore: false @@ -48566,37 +56275,38 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB1_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 LVCB: 8 LVPA: 4 - LVPB: 1 + LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65280 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 65280 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 32256 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 65280 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 8 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 @@ -48613,16 +56323,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -48630,29 +56341,34 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 32 - NumLoadsB: 7 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 7 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 - OptNoLoadLoop: 0 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] @@ -48663,102 +56379,115 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 250 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB1_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 249 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 7 + ThreadTile1: 4 ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 2 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -48771,44 +56500,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x352x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 40576 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 40576 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 23936 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 82176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 40576 + LdsOffsetMetadata_Blk: 82176 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -48819,15 +56549,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] + MIWaveTile: [8, 11] MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveTileB: 11 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 192 + MacroTile1: 352 MacroTileA: 256 - MacroTileB: 192 + MacroTileB: 352 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -48835,28 +56566,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 352 + NumGlobalWriteVectorsPerThread: 44 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 11 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 11 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -48868,14 +56604,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 251 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x352x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 64 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 8 StreamK: 0 @@ -48886,85 +56623,97 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 6 + ThreadTile1: 11 ThreadTileA: 32 - ThreadTileB: 6 + ThreadTileB: 11 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -48976,44 +56725,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33280 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 33280 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 40448 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalReadVectorWidth: 8 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -49023,16 +56773,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 80 + MacroTileA: 64 + MacroTileB: 80 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -49040,28 +56791,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 10 + NumLoadsA: 4 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -49073,29 +56829,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 252 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 251 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -49104,72 +56862,83 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -49181,44 +56950,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA2_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 8 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41984 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 41984 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 25344 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41984 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 56832 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -49228,16 +56998,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 12] - MIWaveTileA: 8 - MIWaveTileB: 12 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 384 - MacroTileA: 256 - MacroTileB: 384 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -49245,28 +57016,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 384 - NumGlobalWriteVectorsPerThread: 48 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 10 NumLoadsA: 8 - NumLoadsB: 12 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -49278,29 +57054,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 253 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 252 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 12 - ThreadTileA: 32 - ThreadTileB: 12 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -49309,49 +57087,58 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 2 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -49359,22 +57146,24 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -49386,37 +57175,38 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB2_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 4 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 @@ -49433,16 +57223,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 224 + MacroTile1: 128 MacroTileA: 256 - MacroTileB: 224 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -49450,28 +57241,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 14 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -49483,29 +57279,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 254 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB2_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SolutionIndex: 253 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -49514,72 +57312,83 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -49591,44 +57400,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -49638,16 +57448,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -49655,28 +57466,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -49688,102 +57504,115 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 255 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 254 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -49796,38 +57625,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA2_WSGRB0_WS64_WG16_16_1 - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60928 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 60928 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 60928 - LdsOffsetMetadata_Blk: 91648 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -49843,16 +57673,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 4] - MIWaveTileA: 12 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -49860,28 +57691,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -49893,102 +57729,115 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 256 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA2_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 255 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 + ThreadTile0: 16 ThreadTile1: 4 - ThreadTileA: 48 + ThreadTileA: 16 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 VectorWidthB: 4 - WaveSeparateGlobalReadA: 2 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: false + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -50001,39 +57850,40 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33280 LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -50048,16 +57898,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] + MIWaveGroup: [1, 2] + MIWaveTile: [8, 3] MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -50065,21 +57916,25 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 6 NumLoadsA: 8 NumLoadsB: 6 NumLoadsCoalescedA: 1 @@ -50087,6 +57942,7 @@ NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -50098,29 +57954,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 257 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 256 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 6 + ThreadTile1: 3 ThreadTileA: 32 - ThreadTileB: 6 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -50129,72 +57987,83 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -50206,39 +58075,40 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 59904 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 + LocalSplitU: 2 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false @@ -50253,16 +58123,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -50270,28 +58141,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -50303,29 +58179,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 258 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 257 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -50334,71 +58212,83 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: true + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1_PSQy59sMz-eVIcifQvUuxv2J6-XcExl1p9GY_zKX3U= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -50411,44 +58301,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT12_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41984 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 41984 - LdsNumElementsAlignedA: 25344 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 25344 - LdsOffsetB_Blk: 90880 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41984 - LdsOffsetMetadata_Blk: 90880 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 59904 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -50458,16 +58349,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [12, 8] - MIWaveTileA: 12 - MIWaveTileB: 8 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 384 - MacroTile1: 256 - MacroTileA: 384 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -50475,28 +58367,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 384 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 12 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 12 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -50508,102 +58405,115 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 259 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT12_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 258 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 8 - ThreadTileA: 48 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 8 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -50616,44 +58526,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT12_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA2_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41984 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 41984 - LdsNumElementsAlignedA: 25344 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 25344 - LdsOffsetB_Blk: 90880 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41984 - LdsOffsetMetadata_Blk: 90880 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 56832 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 + LocalReadVectorWidth: 8 + LocalSplitU: 2 + LocalSplitUReuseLDS: 2 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -50663,16 +58574,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [12, 8] - MIWaveTileA: 12 - MIWaveTileB: 8 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 384 - MacroTile1: 256 - MacroTileA: 384 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -50680,28 +58592,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 384 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 12 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 10 + NumLoadsA: 8 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 12 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -50713,29 +58630,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 260 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT12_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS64_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 259 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 64 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 8 - ThreadTileA: 48 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -50744,49 +58663,58 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: false - - 1LDSBuffer: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 @@ -50794,22 +58722,24 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -50821,34 +58751,35 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 LSCB: 64 - LSPA: 32 + LSPA: 64 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 65024 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 65024 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 0 + LdsOffsetB_Blk: 32768 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 + LdsOffsetMetadata: 0 + LdsOffsetMetadata_Blk: 32768 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 @@ -50868,16 +58799,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveGroup: [4, 1] + MIWaveTile: [5, 14] + MIWaveTileA: 5 + MIWaveTileB: 14 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 320 + MacroTile1: 224 + MacroTileA: 320 + MacroTileB: 224 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -50885,28 +58817,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 280 + NumGlobalWriteVectorsPerThread: 280 + NumLoadsA: 10 + NumLoadsB: 7 + NumLoadsCoalescedA: 2 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -50918,103 +58855,116 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 261 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 260 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 256 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 20 + ThreadTile1: 14 + ThreadTileA: 20 + ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: true + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 - AssignedDerivedParameters: true + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default + CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -51026,38 +58976,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB2_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 4 - LVCA: 8 - LVCB: 8 - LVPA: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 59392 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -51073,16 +59024,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -51090,28 +59042,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -51123,29 +59080,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 262 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB2_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 261 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -51154,47 +59113,57 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2tkAs12w_Ywsz9iiW5cO7RSYL3uE4bWJ3ZIRu-rOR1h8= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false @@ -51204,22 +59173,24 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -51231,33 +59202,34 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA2_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 32 + LSPA: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 - LVPA: 1 + LVCB: 16 + LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 37376 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 2560 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 100352 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -51278,16 +59250,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -51295,28 +59268,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -51328,29 +59306,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 263 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 262 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -51359,72 +59339,84 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 2 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6ehIJgJlyLJS4ze08Pe2BHW64BmXlx8yZwNxtSU8wjMw= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -51436,44 +59428,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33280 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 33280 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalReadVectorWidth: 8 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -51483,16 +59476,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -51500,28 +59494,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 16 - NumLoadsB: 16 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -51533,29 +59532,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 264 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 263 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -51564,72 +59565,83 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -51641,44 +59653,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x112x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 57856 - LdsOffsetMetadata_Blk: 88576 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 4 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -51688,16 +59701,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [10, 4] - MIWaveTileA: 10 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 256 - MacroTileA: 160 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 112 + MacroTileA: 128 + MacroTileB: 112 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -51705,28 +59719,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 7 + NumLoadsA: 8 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -51738,29 +59757,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 265 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 264 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x112x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 4 - ThreadTileA: 40 - ThreadTileB: 4 + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -51769,72 +59790,83 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 8 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -51846,44 +59878,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA2_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 256 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 55296 + LdsBytesNoAmax: 25216 LdsInitCVgprs: false - LdsNumBytes: 55296 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 25216 + LdsNumElementsAlignedA: 8320 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8320 + LdsOffsetB_Blk: 41088 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 55296 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 25216 + LdsOffsetMetadata_Blk: 41088 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -51893,16 +59926,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 6] - MIWaveTileA: 6 - MIWaveTileB: 6 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 192 - MacroTileA: 192 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -51910,28 +59944,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 72 - NumLoadsA: 6 - NumLoadsB: 6 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -51943,78 +59982,89 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 266 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 265 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 256 + StaggerUMapping: 0 + StaggerUStride: 64 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 6 - ThreadTileA: 24 - ThreadTileB: 6 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 2 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: false + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false @@ -52024,22 +60074,24 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -52051,7 +60103,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -52060,24 +60113,24 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 63488 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 35840 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -52099,15 +60152,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveTile: [6, 7] + MIWaveTileA: 6 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 192 + MacroTile1: 224 + MacroTileA: 192 + MacroTileB: 224 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -52115,28 +60169,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 168 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 6 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 7 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -52148,16 +60207,17 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 267 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 266 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -52166,11 +60226,12 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 24 + ThreadTile1: 7 + ThreadTileA: 24 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -52179,71 +60240,83 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT25Cd44YvXxmUj-qYRqzI2CtxbAD5HM2Y0Bsr0k0pKd6Y= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -52256,44 +60329,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 25088 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 25088 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -52304,15 +60378,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] + MIWaveTile: [8, 4] MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 192 + MacroTile1: 128 MacroTileA: 256 - MacroTileB: 192 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -52320,28 +60395,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -52353,14 +60433,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 268 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 267 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 64 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 8 StreamK: 0 @@ -52371,11 +60452,12 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 32 - ThreadTileB: 6 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -52384,47 +60466,57 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6koWquPjoHCs1Oma2AObVEt3w56xaxon-NIAIo4C1TTE= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false @@ -52434,22 +60526,24 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -52461,37 +60555,38 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 + LdsBytesNoAmax: 39424 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 30464 - LdsNumElementsAlignedB: 33024 + LdsNumBytes: 39424 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 30720 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 30464 - LdsOffsetB_Blk: 96000 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 96000 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 39424 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 @@ -52508,16 +60603,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [7, 8] - MIWaveTileA: 7 - MIWaveTileB: 8 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 256 - MacroTileA: 224 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -52525,28 +60621,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 224 - NumLoadsA: 28 - NumLoadsB: 32 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 28 - NumLoadsPerpendicularB: 32 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -52558,102 +60659,116 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 269 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 268 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 8 - ThreadTileA: 28 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 8 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1gm9sxANwKkJxUZ5OPxPVswfhQaPnBbTUxcYB7NLBAn0= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -52666,34 +60781,35 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA4_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 0 + LdsOffsetB_Blk: 32768 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 32768 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 @@ -52702,8 +60818,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -52713,16 +60829,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -52730,28 +60847,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 4 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -52763,103 +60885,117 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 270 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 269 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA4_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: true + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 16 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6vk03BoZH6obuT3awSPqG468SuNhlH-Yb-cTTxQKqPLI= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -52871,33 +61007,34 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 32256 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 13824 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -52907,8 +61044,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -52918,16 +61055,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -52935,28 +61073,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -52968,29 +61111,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 271 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM16_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 270 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 4 + ThreadTile1: 3 + ThreadTileA: 4 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -52999,33 +61144,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -53037,15 +61190,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3f6bKPdXenk8Z-ZAIf5o1eN9nroA7XJ4UcT56oTXlucg= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -53053,18 +61208,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -53076,45 +61233,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41984 + LdsBytesNoAmax: 18432 LdsInitCVgprs: false - LdsNumBytes: 41984 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 25344 + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41984 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -53125,15 +61282,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 12] - MIWaveTileA: 8 - MIWaveTileB: 12 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 384 - MacroTileA: 256 - MacroTileB: 384 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -53141,28 +61299,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 384 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 8 - NumLoadsB: 12 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -53174,16 +61337,17 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 272 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 271 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -53192,11 +61356,12 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 12 - ThreadTileA: 32 - ThreadTileB: 12 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -53205,33 +61370,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -53243,15 +61416,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT8jjxPn5Zz_jcdmghzDQCl0wxtNz90pizOJqu21Izxk0w= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -53259,18 +61434,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -53282,45 +61459,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 36992 + LdsBytesNoAmax: 41472 LdsInitCVgprs: false - LdsNumBytes: 36992 - LdsNumElementsAlignedA: 21760 - LdsNumElementsAlignedB: 15232 + LdsNumBytes: 41472 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 21760 - LdsOffsetB_Blk: 87296 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 36992 - LdsOffsetMetadata_Blk: 87296 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 41472 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -53330,16 +61507,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [10, 7] - MIWaveTileA: 10 - MIWaveTileB: 7 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 224 - MacroTileA: 320 - MacroTileB: 224 + MacroTile0: 80 + MacroTile1: 64 + MacroTileA: 80 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -53347,28 +61525,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 280 - NumGlobalWriteVectorsPerThread: 140 - NumLoadsA: 10 - NumLoadsB: 7 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 5 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 7 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -53380,64 +61563,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 273 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT320x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS64_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 272 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT80x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 64 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 7 - ThreadTileA: 40 - ThreadTileB: 7 + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -53449,15 +61642,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1QUT6_s-nDeWKasVoqZMXKk6b1pb253Y55lIbzqqwVyg= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -53465,18 +61660,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -53488,39 +61685,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 25600 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 41472 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -53536,16 +61733,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -53553,28 +61751,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -53586,64 +61789,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 274 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 273 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -53655,15 +61868,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1Ro6bB5Afv2Qw4JFpy1qjbeWIf6tRFcO6bvTsyopaSlQ= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -53671,18 +61886,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -53694,34 +61911,34 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 63488 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 63488 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 111616 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 63488 + LdsOffsetMetadata_Blk: 111616 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -53731,8 +61948,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -53743,15 +61960,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -53759,28 +61977,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerBatchStore: 15 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 10 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -53792,16 +62015,17 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 275 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 274 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS15_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -53810,11 +62034,12 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -53823,33 +62048,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -53861,9 +62094,11 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2GtId_NQBoUG9Chu4LpZVygW0ZFAQ2gByrrIz1UnaROM= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false @@ -53877,18 +62112,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -53900,8 +62137,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -53910,24 +62147,24 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 39424 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 39424 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 39424 + LdsOffsetMetadata_Blk: 100352 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -53948,16 +62185,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 192 + MacroTile1: 32 MacroTileA: 256 - MacroTileB: 192 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -53965,28 +62203,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 1 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -53998,29 +62241,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 276 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 275 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -54029,33 +62274,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 + VectorWidthA: 4 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -54070,12 +62323,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -54083,18 +62337,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -54106,45 +62362,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -54154,16 +62410,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -54171,28 +62428,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -54204,64 +62466,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 277 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 276 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -54276,12 +62548,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -54289,18 +62562,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -54312,39 +62587,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 59392 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -54360,16 +62635,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -54377,21 +62653,25 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 3 NumLoadsA: 8 NumLoadsB: 6 NumLoadsCoalescedA: 1 @@ -54399,6 +62679,7 @@ NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -54410,29 +62691,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 278 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 277 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -54441,33 +62724,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -54479,15 +62770,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1xDQCHoBtWGcvqG--wCBApiEn7upkMuD4ptTgcQrAx5M= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -54495,18 +62788,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -54518,24 +62813,24 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 38400 LdsInitCVgprs: false - LdsNumBytes: 61440 + LdsNumBytes: 38400 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -54544,13 +62839,13 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 + LdsOffsetMetadata: 38400 LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -54566,16 +62861,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -54583,28 +62879,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 1 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -54616,64 +62917,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 279 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 278 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -54685,15 +62996,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1qXrysDQaQSyY_GvNMymmXQ6H5oOxC2KzfCcv9tC-89c= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -54701,18 +63014,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -54724,39 +63039,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 38400 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 38400 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 38400 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -54772,16 +63087,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -54789,28 +63105,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -54822,29 +63143,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 280 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 279 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -54853,33 +63176,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -54894,12 +63225,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -54907,18 +63239,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -54930,45 +63264,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -54978,16 +63312,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -54995,28 +63330,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -55028,64 +63368,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 281 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 280 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -55097,15 +63447,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT10wKlkcytgQzsH8lAxfnxveAuAcZlauUDHie645M92G8= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -55113,17 +63465,19 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -55136,45 +63490,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT12_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41984 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 41984 - LdsNumElementsAlignedA: 25344 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 25344 - LdsOffsetB_Blk: 90880 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41984 - LdsOffsetMetadata_Blk: 90880 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -55185,15 +63539,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [12, 8] - MIWaveTileA: 12 - MIWaveTileB: 8 + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 384 - MacroTile1: 256 - MacroTileA: 384 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -55201,28 +63556,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 384 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 12 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 12 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -55234,14 +63594,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 282 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT12_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 281 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 64 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 0 @@ -55252,46 +63613,55 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 8 - ThreadTileA: 48 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 8 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -55303,15 +63673,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6Oumjg4_i1jHt9lwHaq_nRYi1jfS94Cj3bPpQnk3iA9w= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -55319,18 +63691,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -55342,45 +63716,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT12_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41984 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 41984 - LdsNumElementsAlignedA: 25344 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 25344 - LdsOffsetB_Blk: 90880 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41984 - LdsOffsetMetadata_Blk: 90880 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 16 + LoopUnroll: 256 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -55391,15 +63765,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [12, 8] - MIWaveTileA: 12 - MIWaveTileB: 8 + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 384 - MacroTile1: 256 - MacroTileA: 384 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -55407,28 +63782,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 384 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 12 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 12 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -55440,16 +63820,17 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 283 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT384x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT12_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS64_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 282 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 64 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -55458,46 +63839,55 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 8 - ThreadTileA: 48 - ThreadTileB: 8 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -55509,15 +63899,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT6_bpvSUjRpM3vb1RD4OyrAJfXhF4IA_Ivvfyo1fxMbyI= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -55525,18 +63917,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -55548,34 +63942,34 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - L1CacheSwizzle: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 100352 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -55585,8 +63979,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 16 + LoopUnroll: 256 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -55596,16 +63990,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -55613,28 +64008,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -55646,64 +64046,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 284 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LCS0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 283 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -55715,15 +64125,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT3mXXvlyk2mDYTYtDXM98mMa8ehLVvxYIjN-zWBjYINfY= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -55731,18 +64143,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -55754,38 +64168,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 25600 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 49664 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -55801,16 +64216,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -55818,28 +64234,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -55851,64 +64272,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 285 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 284 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: true + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -55923,12 +64354,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -55936,18 +64368,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -55959,38 +64393,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalReadVectorWidth: 8 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -56006,16 +64441,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -56023,10 +64459,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -56036,15 +64476,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 14 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -56056,29 +64497,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 286 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCG304 + SolutionIndex: 285 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -56087,33 +64530,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -56125,15 +64576,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1Oz1TuvaPeWWTi5-WZD7mbIaCeD02DbTugx-Hq5VtXmg= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -56141,21 +64594,23 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 4, 2] InnerUnroll: 1 @@ -56164,44 +64619,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -56211,16 +64667,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -56228,28 +64685,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -56261,29 +64723,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 287 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 286 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 1 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -56292,33 +64756,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 512 + _DepthUA: 512 + _DepthUB: 512 + _DepthUMetadata: 512 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: true + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -56333,12 +64805,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -56346,18 +64819,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -56369,38 +64844,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA1_NTB0_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -56416,16 +64892,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -56433,28 +64910,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 6 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -56466,29 +64948,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 288 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA1_NTB0_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 287 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -56497,33 +64981,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -56535,15 +65027,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2KAu6MKwGF3OIWw0WeHtQc2SHBohuEA-flWPzMyUelDU= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -56551,16 +65045,18 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 GroupLoadStore: false @@ -56574,44 +65070,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 41984 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 41984 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 25344 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 82176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 41984 + LdsOffsetMetadata_Blk: 82176 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -56622,15 +65119,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] + MIWaveTile: [8, 12] MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveTileB: 12 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 192 + MacroTile1: 384 MacroTileA: 256 - MacroTileB: 192 + MacroTileB: 384 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -56638,28 +65136,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerThread: 384 + NumGlobalWriteVectorsPerThread: 48 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 12 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -56671,15 +65174,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 289 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 288 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_12_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 64 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 @@ -56689,11 +65193,12 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 6 + ThreadTile1: 12 ThreadTileA: 32 - ThreadTileB: 6 + ThreadTileB: 12 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -56702,33 +65207,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: true + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -56743,12 +65256,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -56756,18 +65270,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -56779,33 +65295,34 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB2_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 4 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 61440 LdsInitCVgprs: false LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -56815,8 +65332,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -56827,15 +65344,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -56843,28 +65361,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -56876,16 +65399,17 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 290 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB2_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 289 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -56894,11 +65418,12 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -56907,33 +65432,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -56945,15 +65478,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1KJHCixFmk1v5eu9k6Y0ZU0Rtp14UNJreK8-Iv4XdOTc= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -56961,18 +65496,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -56984,44 +65521,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -57032,15 +65570,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -57048,28 +65587,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 14 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -57081,16 +65625,17 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 291 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 + SolutionIndex: 290 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -57099,46 +65644,55 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - tailLoopOpt: true + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -57153,9 +65707,10 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 @@ -57166,18 +65721,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -57189,7 +65746,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -57198,24 +65756,24 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 45056 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -57237,15 +65795,16 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -57253,28 +65812,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -57286,16 +65850,17 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 292 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 291 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -57304,46 +65869,55 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -57358,12 +65932,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -57371,17 +65946,19 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -57394,38 +65971,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -57441,16 +66019,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -57458,29 +66037,34 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 NumThreads: 256 - OptNoLoadLoop: 0 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] @@ -57491,64 +66075,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 293 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 292 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: true + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -57563,12 +66157,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -57576,18 +66171,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -57599,38 +66196,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -57646,16 +66244,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -57663,28 +66262,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -57696,29 +66300,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 294 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC1_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCG304 + SolutionIndex: 293 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -57727,33 +66333,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -57768,12 +66382,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -57781,18 +66396,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -57804,38 +66421,39 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 + LdsBytesNoAmax: 34304 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalReadVectorWidth: 8 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -57851,16 +66469,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -57868,28 +66487,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 14 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -57901,64 +66525,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 295 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 294 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -57973,12 +66607,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -57986,18 +66621,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -58009,44 +66646,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -58056,16 +66694,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -58073,29 +66712,34 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 - OptNoLoadLoop: 0 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] @@ -58106,29 +66750,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 296 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 295 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 + StaggerU: 16 + StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -58137,33 +66783,41 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: true + _staggerStrideShift: 0 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -58178,12 +66832,13 @@ BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -58191,17 +66846,19 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -58214,44 +66871,45 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 56320 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 + LocalSplitU: 4 + LocalSplitUReuseLDS: 3 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: true MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -58261,16 +66919,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] + MIWaveGroup: [1, 1] + MIWaveTile: [8, 5] MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -58278,10 +66937,14 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -58290,16 +66953,17 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 5 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -58311,64 +66975,74 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 297 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 296 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 6 + ThreadTile1: 5 ThreadTileA: 32 - ThreadTileB: 6 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 8 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -58380,12 +67054,14 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2TDsJcT35jUzckiCFvVD9pweD51ZT0ENcCP3gKGokg9I= BufferLoad: true BufferStore: true CUCount: null + CUOccupancy: -1 ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 @@ -58396,18 +67072,20 @@ DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -58419,7 +67097,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -58428,24 +67107,24 @@ LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 57856 + LdsOffsetMetadata_Blk: 100352 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -58466,16 +67145,17 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 @@ -58483,28 +67163,33 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 8 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 NumThreads: 256 + NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -58516,29 +67201,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 298 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC1_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 297 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 1 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -58547,13597 +67234,1710 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false UseInstOffsetForGRO: 0 UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 - LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 14 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 299 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 2 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 - LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 35840 - LdsNumElementsAlignedB: 25600 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 35840 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: 0 - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [7, 5] - MIWaveTileA: 7 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 160 - MacroTileA: 224 - MacroTileB: 160 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 140 - NumGlobalWriteVectorsPerThread: 140 - NumLoadsA: 7 - NumLoadsB: 20 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 7 - NumLoadsPerpendicularB: 20 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 300 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU2_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 5 - ThreadTileA: 28 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 2] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - tailLoopOpt: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 512 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LSCA: 512 - LSCB: 512 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 50688 - LdsInitCVgprs: false - LdsNumBytes: 50688 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 - MFMA_BF16_1K: true - MIArchVgpr: 1 - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 301 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 512 - _DepthUA: 512 - _DepthUB: 512 - _DepthUMetadata: 512 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - tailLoopOpt: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 16 - AssertFree1ElementMultiple: 16 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 - LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 302 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM16_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 1 - StaggerUStride: 128 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT64X2YoLgwgkNmk2QgXSj_lms81hXJK4Emcrk5tU9JkZik= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.3 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p30_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA2_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41472 - LdsInitCVgprs: false - LdsNumBytes: 41472 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 83968 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41472 - LdsOffsetMetadata_Blk: 83968 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: 1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 5] - MIWaveTileA: 1 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 80 - MacroTileA: 64 - MacroTileB: 80 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 20 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 303 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p30_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA2_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCG0 - SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 1 - StaggerUStride: 1024 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 5 - ThreadTileA: 4 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT64dz-QcBO8G3xPp4z_JyBIuufyq-m_hhfK6wvtnd_3uUg= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.8 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p27_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC7_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 11776 - LdsInitCVgprs: false - LdsNumBytes: 11776 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 11776 - LdsOffsetMetadata_Blk: 25600 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: 0.27 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 304 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p27_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC7_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM4_SUS128_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCG0 - SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 4 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.6 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p60_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA2_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 45056 - LdsInitCVgprs: false - LdsNumBytes: 45056 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45056 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 4] - MIWaveTileA: 6 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 0 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 305 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p60_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 4 - ThreadTileA: 24 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT25DmIimxyKAym_8vtFOOxiflc3vmf6nh1ie9SgwrMwY_k= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 - LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25600 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 59392 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 306 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCG304 - SourceSwap: 1 - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT126VlbQ658zPSisyZEnBWlthGSRo43qt5FQgXLm-ubwkc= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.5 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p50_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA1_WSGRB1_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 - LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 25600 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 0 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 307 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p50_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 1 - WaveSeparateGlobalReadB: 1 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.5 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p50_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA2_WSGRB2_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 - LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 57856 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 10] - MIWaveTileA: 4 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 308 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p50_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU64_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA2_WSGRB2_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 64 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 10 - ThreadTileA: 16 - ThreadTileB: 10 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.7 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p70_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA2_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 - LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25600 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 59392 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 2 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 309 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p70_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SU64_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM12_WGMXCC4_WGMXCCG0 - SourceSwap: 1 - StaggerU: 64 - StaggerUMapping: 1 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 12 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT12YEX6OzAt39bMuIrZ6RBN-duuGLdI0FLeSGfWjXPv-_0= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.8 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 - LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 57856 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 80 - MacroTileA: 128 - MacroTileB: 80 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 0 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 310 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC4_WGMXCCG0 - SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT64jaPr1-PSHCqfGwOnG3gEB7CzVRLr02kxeILFbAALRbw= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.4 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p40_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p90_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 45056 - LdsInitCVgprs: false - LdsNumBytes: 45056 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45056 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: 0.9 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 311 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p40_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p90_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM3_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCG0 - SourceSwap: 1 - StaggerU: 2 - StaggerUMapping: 3 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SynchronizerSizeCheck: 1 - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT25DmIimxyKAym_8vtFOOxiflc3vmf6nh1ie9SgwrMwY_k= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.26 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 - LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25600 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 59392 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 0 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 312 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 - LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25600 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 59392 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 313 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 - SourceSwap: 1 - StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT12Y2LWyY45KwUl8NahONXKp2n10XHogRMaJ0ea2f_hZqg= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.26 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB1_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 - LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 15360 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 0 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 314 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB1_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 1 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT25kWeE_QzwHeVDy1Whgi17IXYJYFwoH510gEr7XAtnOcA= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.9 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p90_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB4_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 - LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 315 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p90_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB4_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 2 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT194ljiXSn3oJV4-dk-EdKl0gT9Bf8vuLg1V-PjS75Jxi0= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.8 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA2_WSGRB2_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 45056 - LdsInitCVgprs: false - LdsNumBytes: 45056 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45056 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 4] - MIWaveTileA: 6 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 0 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 316 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA2_WSGRB2_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 2 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 4 - ThreadTileA: 24 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT3266EJEtFWLzFAXToAaI9uJhBhRomkxa6PwNmnYRrzv6k= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 1 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.27 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM0p27_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW8_LWPM0p80_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS0_NLCA2_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 2 - LVCB: 4 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 4608 - LdsInitCVgprs: false - LdsNumBytes: 4608 - LdsNumElementsAlignedA: 0 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 0 - LdsOffsetB_Blk: 8192 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 8192 - LdsPadA: 0 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: 0.8 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 32 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [32, 32, 8, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 2 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 64 - NumWaveSplitK: 1 - OptNoLoadLoop: 0 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 317 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM0p27_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW8_LWPM0p80_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS0_NLCA2_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_1_WGM266_WGMXCC1_WGMXCCG0 - SourceSwap: 1 - StaggerU: 2 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 2, 1] - WorkGroupMapping: 266 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: true - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT25DHG7xTSClPbpv_I4DaDPsO0pMGVgoWabAfCIjqTh6sg= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 - LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 14 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 318 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCG304 - SourceSwap: 1 - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT19I6ohdwYE1BPVeyVj7FCZ5ulvsY9MDpEBibsFmpk0jrA= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.7 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p70_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_10_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA2_WSGRB2_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 53760 - LdsInitCVgprs: false - LdsNumBytes: 53760 - LdsNumElementsAlignedA: 30720 - LdsNumElementsAlignedB: 23040 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 30720 - LdsOffsetB_Blk: 96256 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 53760 - LdsOffsetMetadata_Blk: 96256 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [3, 10] - MIWaveTileA: 3 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 160 - MacroTileA: 192 - MacroTileB: 160 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 120 - NumGlobalWriteVectorsPerThread: 120 - NumLoadsA: 6 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 319 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p70_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_10_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM3_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA2_WSGRB2_WS64_WG64_4_1_WGM0_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 16 - StaggerUMapping: 3 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 10 - ThreadTileA: 12 - ThreadTileB: 10 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 2 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT12v4GrtcTZNWe-mJBLZOQK8ZgkEt3oCSxbpqbFQWBjaUQ= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 - LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 320 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCG0 - SourceSwap: 1 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SynchronizerSizeCheck: 1 - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT126VlbQ658zPSisyZEnBWlthGSRo43qt5FQgXLm-ubwkc= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 - LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 25600 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 0 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 321 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU64_SUM2_SUS2048_SPO1_SRVW0_SSO1_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCG0 - SourceSwap: 1 - StaggerU: 64 - StaggerUMapping: 2 - StaggerUStride: 2048 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 4 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT127oFYVejTQePIZWvAUjUUnI73lakjtlRte8nZxn1BBQU= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 1 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.26 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: true - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB1_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA16_LPB0_LPM0_LRVW8_LWPM0p80_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS6_NLCA1_NLCB2_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 32 - LSPA: 32 - LSPB: 64 - LVCA: 8 - LVCB: 4 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 0 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16896 - LdsInitCVgprs: false - LdsNumBytes: 16896 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 0 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 16 - LdsPadB: 0 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: 0.8 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 2 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 322 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB1_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA16_LPB0_LPM0_LRVW8_LWPM0p80_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS6_NLCA1_NLCB2_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM2_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 2 - StaggerUMapping: 2 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: true - tailLoopOptA: true - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT25DmIimxyKAym_8vtFOOxiflc3vmf6nh1ie9SgwrMwY_k= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.26 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO2_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB1_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 - LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25600 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 59392 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 5] - MIWaveTileA: 8 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 160 - MacroTileA: 256 - MacroTileB: 160 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 0 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 323 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC1_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO2_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB1_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 5 - ThreadTileA: 32 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 1 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT64DrZNKljEvvN0eo1NuZA0tiB7bUcTrvSeu7PZBaCu2vw= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.26 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p25_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 13824 - LdsInitCVgprs: false - LdsNumBytes: 13824 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 25088 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 13824 - LdsOffsetMetadata_Blk: 25088 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: 0.25 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 324 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p25_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM4_SUS128_SPO1_SRVW0_SSO1_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC4_WGMXCCG0 - SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 4 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - SynchronizerSizeCheck: 1 - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT64jaPr1-PSHCqfGwOnG3gEB7CzVRLr02kxeILFbAALRbw= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p50_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA1_WSGRB1_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 45056 - LdsInitCVgprs: false - LdsNumBytes: 45056 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45056 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: 0.5 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 2 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 325 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p50_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCG0 - SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 1 - StaggerUStride: 1024 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 1 - WaveSeparateGlobalReadB: 1 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT64-PlEumzYOok0AB2P21dcx7aH80pW9X5pObp651vCOXg= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.25 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p25_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO2_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA1_WSGRB1_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 - LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 0 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 326 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p25_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM1_SUS1024_SPO1_SRVW0_SSO2_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 2 - StaggerUMapping: 1 - StaggerUStride: 1024 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 1 - WaveSeparateGlobalReadB: 1 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 3 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT64jaPr1-PSHCqfGwOnG3gEB7CzVRLr02kxeILFbAALRbw= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.8 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p27_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 45056 - LdsInitCVgprs: false - LdsNumBytes: 45056 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45056 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: 0.27 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 327 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p27_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCG32 - SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SynchronizerSizeCheck: 1 - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 32 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT16-m281vS2vLvzMPs4Lafe_X5YwK2kHxyvNF3ZJL2ij6s= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 1 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.8 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPM0p20_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC7_NTD0_NTM0_NEPBS8_NLCA8_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB1_WS64_WG16_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 256 - LSPA: 16 - LSPB: 2 - LVCA: 4 - LVCB: 32 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 8704 - LdsInitCVgprs: false - LdsNumBytes: 8704 - LdsNumElementsAlignedA: 0 - LdsNumElementsAlignedB: 8704 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 0 - LdsOffsetB_Blk: 16384 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 16384 - LdsPadA: 0 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: 0.2 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 7 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 8 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 - NumThreads: 64 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 328 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPM0p20_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC7_NTD0_NTM0_NEPBS8_NLCA8_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS512_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB1_WS64_WG16_4_1_WGM1_WGMXCC16_WGMXCCG0 - SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 1 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 1 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: true - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT25DHG7xTSClPbpv_I4DaDPsO0pMGVgoWabAfCIjqTh6sg= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 - LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 14 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 329 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG304 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: 304 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT12ejZsQ6ttuk5mgsoNge3MeHdy_vaVbcsIhvt7iaEG1RU= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.8 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA4_GRVWB2_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO3_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA1_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14208 - LdsInitCVgprs: false - LdsNumBytes: 14208 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 5504 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 25088 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 14208 - LdsOffsetMetadata_Blk: 25088 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 80 - MacroTileA: 128 - MacroTileB: 80 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 0 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 330 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x80x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA4_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM0_SUS64_SPO1_SRVW0_SSO3_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA1_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 2 - StaggerUMapping: 0 - StaggerUStride: 64 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 3 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 1 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT25UChAdv-Stz5ZsTAadf_7wZ9kjnSgcIUzcxaRR0UVYJM= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25088 - LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 49408 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25088 - LdsOffsetMetadata_Blk: 49408 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 331 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 64 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT64JggGy9s21NxlKnuDHQQHlIOsEQbLiVh6K0eR0Q_I-yg= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.7 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p70_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA2_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60928 - LdsInitCVgprs: false - LdsNumBytes: 60928 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 26112 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 60928 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: 1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 48 - MacroTileA: 64 - MacroTileB: 48 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 332 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p70_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU4_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA2_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC8_WGMXCCG0 - SourceSwap: 0 - StaggerU: 4 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 - LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 14 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 333 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT19NurFEvZgzJmp1-BrkbtTl2nhCiBoBegPf7n6Sz0C8SM= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 1 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.8 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: true - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB1_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA16_LPB0_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB2_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA2_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 64 - LVCA: 8 - LVCB: 4 - LVPA: 1 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 0 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58880 - LdsInitCVgprs: false - LdsNumBytes: 58880 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 0 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 58880 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 58880 - LdsPadA: 16 - LdsPadB: 0 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [12, 5] - MIWaveTileA: 12 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 320 - MacroTileA: 192 - MacroTileB: 320 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 6 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 2 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 334 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB1_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA16_LPB0_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB2_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM4_SUS256_SPO1_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA2_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 4 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 5 - ThreadTileA: 48 - ThreadTileB: 5 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: true - tailLoopOptA: true - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HAS_SAV_UserArgs_MT12Y2LWyY45KwUl8NahONXKp2n10XHogRMaJ0ea2f_hZqg= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.6 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p60_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA2_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 - LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 15360 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 0 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 335 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p60_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCG0 - SourceSwap: 1 - StaggerU: 2 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SynchronizerSizeCheck: 1 - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 20480 - LdsInitCVgprs: false - LdsNumBytes: 20480 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 15360 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 37888 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 37888 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 96 - MacroTileA: 32 - MacroTileB: 96 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 1 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 336 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT32x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 8 - AssertSummationElementMultiple: 32 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 128 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 36864 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 53760 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: -1 - LocalReadVectorWidth: 8 - LocalSplitU: 2 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 1 - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 337 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM0_WGMXCC1 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 1024 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 - LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 14 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 338 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC4_WGMXCCG0 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: 0 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 128 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA1_WSGRB1_WS64_WG32_8_1 - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 - LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 339 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCG304 - SourceSwap: 1 - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 1 - WaveSeparateGlobalReadB: 1 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: 304 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x80x16_MI16x50HYW_CNIuM5cNezlM4-0NdBtkNRDpFtc64xhEXp_fs= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 16 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.6 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x80x16_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p60_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 11520 - LdsInitCVgprs: false - LdsNumBytes: 11520 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 2816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 25088 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 11520 - LdsOffsetMetadata_Blk: 25088 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 16 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 80 - MacroTileA: 256 - MacroTileB: 80 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 0 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 340 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x80x16_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p60_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS64_SPO0_SRVW0_SSO1_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 0 - StaggerUStride: 64 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 16 - _DepthUA: 16 - _DepthUB: 16 - _DepthUMetadata: 16 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 8 - AssertSummationElementMultiple: 32 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_EPS0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumElements: 31872 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 15232 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 49408 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 31872 - LdsOffsetMetadata_Blk: 49408 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: -1 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 14] - MIWaveTileA: 4 - MIWaveTileB: 14 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 16 - NumLoadsB: 14 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 341 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_EPS0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_TLDS1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM40_WGMXCC1 - SourceSwap: true - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 14 - ThreadTileA: 16 - ThreadTileB: 14 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 40 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 256 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.2 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p20_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC2_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB2_WS64_WG64_4_1 - LSCA: 256 - LSCB: 256 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 - LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 8704 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 16 - LoopUnroll: 256 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - OptNoLoadLoop: 0 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 342 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p20_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB0_NTC2_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB2_WS64_WG64_4_1_WGM0_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 16 - StaggerUMapping: 1 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 2 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - tailLoopOpt: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x192x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23936 - LdsInitCVgprs: false - LdsNumBytes: 23936 - LdsNumElementsAlignedA: 10880 - LdsNumElementsAlignedB: 13056 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 10880 - LdsOffsetB_Blk: 43648 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23936 - LdsOffsetMetadata_Blk: 43648 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [10, 3] - MIWaveTileA: 10 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 192 - MacroTileA: 160 - MacroTileB: 192 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 120 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 10 - NumLoadsB: 12 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 12 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 343 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x192x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS64_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 64 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 3 - ThreadTileA: 40 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - tailLoopOpt: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 - LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 344 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 128 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - tailLoopOpt: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 - LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 345 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - tailLoopOpt: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x160x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23936 - LdsInitCVgprs: false - LdsNumBytes: 23936 - LdsNumElementsAlignedA: 13056 - LdsNumElementsAlignedB: 10880 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 13056 - LdsOffsetB_Blk: 45824 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23936 - LdsOffsetMetadata_Blk: 45824 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [3, 10] - MIWaveTileA: 3 - MIWaveTileB: 10 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 160 - MacroTileA: 192 - MacroTileB: 160 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 120 - NumGlobalWriteVectorsPerThread: 120 - NumLoadsA: 12 - NumLoadsB: 10 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 12 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 346 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x160x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS64_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 64 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 10 - ThreadTileA: 12 - ThreadTileB: 10 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - tailLoopOpt: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41472 - LdsInitCVgprs: false - LdsNumBytes: 41472 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41472 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [10, 2] - MIWaveTileA: 10 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 5 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 347 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 2 - ThreadTileA: 40 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_3_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62976 - LdsInitCVgprs: false - LdsNumBytes: 62976 - LdsNumElementsAlignedA: 32256 - LdsNumElementsAlignedB: 30720 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32256 - LdsOffsetB_Blk: 97792 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 62976 - LdsOffsetMetadata_Blk: 97792 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [14, 3] - MIWaveTileA: 14 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 192 - MacroTileA: 224 - MacroTileB: 192 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 168 - NumGlobalWriteVectorsPerThread: 84 - NumLoadsA: 7 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 7 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 348 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT224x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT14_3_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 128 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 56 - ThreadTile1: 3 - ThreadTileA: 56 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - tailLoopOpt: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 55296 - LdsInitCVgprs: false - LdsNumBytes: 55296 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 27648 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 55296 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 6] - MIWaveTileA: 6 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 192 - MacroTileA: 192 - MacroTileB: 192 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 72 - NumLoadsA: 6 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 349 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 6 - ThreadTileA: 24 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35328 - LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 350 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - tailLoopOpt: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 - LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 351 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - tailLoopOpt: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51712 - LdsInitCVgprs: false - LdsNumBytes: 51712 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51712 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 352 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 128 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - tailLoopOpt: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 - LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 25600 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 353 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - tailLoopOpt: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 - LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 25600 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25600 - LdsOffsetB_Blk: 91136 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 91136 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 4] - MIWaveTileA: 5 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 354 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 4 - ThreadTileA: 20 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - tailLoopOpt: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 33280 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 355 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 4 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LSCA: 128 - LSCB: 128 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60928 - LdsInitCVgprs: false - LdsNumBytes: 60928 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 26112 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 60928 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 6] - MIWaveTileA: 2 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 356 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU4_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 4] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - tailLoopOpt: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35328 - LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 2] - MIWaveTileA: 8 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 357 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - tailLoopOpt: true - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x320x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28288 - LdsInitCVgprs: false - LdsNumBytes: 28288 - LdsNumElementsAlignedA: 6528 - LdsNumElementsAlignedB: 21760 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 6528 - LdsOffsetB_Blk: 39296 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 28288 - LdsOffsetMetadata_Blk: 39296 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 5] - MIWaveTileA: 6 - MIWaveTileB: 5 - MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 320 - MacroTileA: 96 - MacroTileB: 320 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 120 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 6 - NumLoadsB: 20 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 20 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 358 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT96x320x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS64_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 1 - StaggerUStride: 64 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 5 - ThreadTileA: 24 - ThreadTileB: 5 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - tailLoopOpt: true + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - [2, 3, 0, 1] - - - [2048, 64896, 1, 512] - - [0, 0.0] + - [18, 0.0] - - [2048, 17556, 1, 512] - - [63, 0.0] + - [297, 0.0] - - [512, 14, 1, 256] - - [1, 0.0] + - [146, 0.0] - - [2048, 16302, 1, 512] - - [63, 0.0] + - [157, 0.0] - - [512, 13, 1, 256] - - [2, 0.0] + - [146, 0.0] - - [8192, 3500, 1, 2048] - - [3, 0.0] + - [18, 0.0] - - [512, 3500, 1, 512] - - [4, 0.0] + - [94, 0.0] - - [2048, 17052, 1, 512] - - [63, 0.0] + - [297, 0.0] - - [262144, 13, 1, 256] - - [5, 0.0] + - [262, 0.0] - - [256, 14, 1, 2560] - - [6, 0.0] + - [77, 0.0] - - [262144, 14, 1, 256] - - [7, 0.0] + - [262, 0.0] - - [8192, 3250, 1, 2048] - - [8, 0.0] + - [112, 0.0] - - [256, 13, 1, 2560] - - [9, 0.0] + - [286, 0.0] - - [512, 3250, 1, 512] - - [10, 0.0] + - [176, 0.0] - - [256, 14, 1, 256] - - [11, 0.0] + - [146, 0.0] - - [2048, 1024, 1, 5560] - - [12, 0.0] + - [296, 0.0] - - [1, 1024, 1, 256] - - [13, 0.0] + - [109, 0.0] - - [512, 1024, 1, 5744] - - [14, 0.0] + - [295, 0.0] - - [128, 1024, 1, 10624] - - [15, 0.0] + - [7, 0.0] - - [512, 1024, 1, 5560] - - [16, 0.0] + - [295, 0.0] - - [1024, 1024, 1, 3840] - - [17, 0.0] + - [294, 0.0] - - [1024, 1024, 1, 2872] - - [18, 0.0] + - [245, 0.0] - - [2872, 1024, 1, 512] - - [19, 0.0] + - [114, 0.0] - - [1, 1024, 1, 128] - - [20, 0.0] + - [109, 0.0] - - [128, 1024, 1, 7040] - - [21, 0.0] + - [160, 0.0] - - [128, 1024, 1, 4416] - - [22, 0.0] + - [95, 0.0] - - [128, 1024, 1, 6784] - - [23, 0.0] + - [7, 0.0] - - [512, 1024, 1, 2872] - - [24, 0.0] + - [293, 0.0] - - [256, 1024, 1, 4608] - - [25, 0.0] + - [55, 0.0] - - [512, 1024, 1, 3072] - - [26, 0.0] + - [292, 0.0] - - [5560, 1024, 1, 2780] - - [27, 0.0] + - [291, 0.0] - - [2780, 1024, 1, 5560] - - [28, 0.0] + - [259, 0.0] - - [512, 16, 1, 256] - - [95, 0.0] + - [50, 0.0] - - [512, 32, 1, 256] - - [1, 0.0] + - [274, 0.0] - - [8192, 4000, 1, 2048] - - [114, 0.0] + - [202, 0.0] - - [512, 4000, 1, 512] - - [29, 0.0] + - [290, 0.0] - - [8192, 8000, 1, 2048] - - [30, 0.0] + - [202, 0.0] - - [512, 8000, 1, 512] - - [31, 0.0] + - [238, 0.0] - - [262144, 16, 1, 256] - - [32, 0.0] + - [262, 0.0] - - [256, 16, 1, 2560] - - [9, 0.0] + - [286, 0.0] - - [262144, 32, 1, 256] - - [33, 0.0] + - [275, 0.0] - - [256, 32, 1, 2560] - - [34, 0.0] + - [77, 0.0] - - [256, 13, 1, 256] - - [35, 0.0] + - [60, 0.0] - - [2048, 2048, 1, 5560] - - [36, 0.0] + - [22, 0.0] - - [1, 2048, 1, 256] - - [20, 0.0] + - [146, 0.0] - - [512, 2048, 1, 5744] - - [37, 0.0] + - [17, 0.0] - - [128, 2048, 1, 10624] - - [38, 0.0] + - [84, 0.0] - - [512, 2048, 1, 5560] - - [39, 0.0] + - [17, 0.0] - - [1024, 2048, 1, 3840] - - [40, 0.0] + - [37, 0.0] - - [1024, 2048, 1, 2872] - - [41, 0.0] + - [289, 0.0] - - [2872, 2048, 1, 512] - - [42, 0.0] + - [110, 0.0] - - [1, 2048, 1, 128] - - [43, 0.0] + - [146, 0.0] - - [128, 2048, 1, 7040] - - [44, 0.0] + - [84, 0.0] - - [128, 2048, 1, 4416] - - [45, 0.0] + - [55, 0.0] - - [128, 2048, 1, 6784] - - [46, 0.0] + - [84, 0.0] - - [512, 2048, 1, 2872] - - [47, 0.0] + - [20, 0.0] - - [256, 2048, 1, 4608] - - [48, 0.0] + - [13, 0.0] - - [512, 2048, 1, 3072] - - [49, 0.0] + - [153, 0.0] - - [2048, 4096, 1, 5560] - - [50, 0.0] + - [74, 0.0] - - [1, 4096, 1, 256] - - [20, 0.0] + - [146, 0.0] - - [512, 4096, 1, 5744] - - [51, 0.0] + - [92, 0.0] - - [128, 4096, 1, 10624] - - [52, 0.0] + - [17, 0.0] - - [512, 4096, 1, 5560] - - [63, 0.0] + - [92, 0.0] - - [1024, 4096, 1, 3840] - - [53, 0.0] + - [216, 0.0] - - [1024, 4096, 1, 2872] - - [54, 0.0] + - [235, 0.0] - - [2872, 4096, 1, 512] - - [63, 0.0] + - [12, 0.0] - - [1, 4096, 1, 128] - - [13, 0.0] + - [146, 0.0] - - [128, 4096, 1, 7040] - - [55, 0.0] + - [84, 0.0] - - [128, 4096, 1, 4416] - - [56, 0.0] + - [13, 0.0] - - [128, 4096, 1, 6784] - - [57, 0.0] + - [20, 0.0] - - [512, 4096, 1, 2872] - - [58, 0.0] + - [28, 0.0] - - [256, 4096, 1, 4608] - - [59, 0.0] + - [20, 0.0] - - [512, 4096, 1, 3072] - - [60, 0.0] + - [92, 0.0] - - [2048, 152710, 1, 512] - - [61, 0.0] + - [288, 0.0] - - [256, 16, 1, 256] - - [62, 0.0] + - [60, 0.0] - - [256, 32, 1, 256] - - [2, 0.0] + - [60, 0.0] - - [4000, 2, 1, 8192] - - [64, 0.0] + - [109, 0.0] - - [4000, 4, 1, 8192] - - [64, 0.0] + - [109, 0.0] - - [4000, 8, 1, 8192] - - [65, 0.0] + - [109, 0.0] - - [4000, 16, 1, 8192] - - [66, 0.0] + - [109, 0.0] - - [4000, 32, 1, 8192] - - [67, 0.0] + - [273, 0.0] - - [4000, 64, 1, 8192] - - [68, 0.0] + - [287, 0.0] - - [4000, 96, 1, 8192] - - [69, 0.0] + - [223, 0.0] - - [1280, 2, 1, 8192] - - [70, 0.0] + - [286, 0.0] - - [1280, 16, 1, 8192] - - [71, 0.0] + - [280, 0.0] - - [1280, 32, 1, 8192] - - [72, 0.0] + - [232, 0.0] - - [1280, 64, 1, 8192] - - [73, 0.0] + - [285, 0.0] - - [1280, 96, 1, 8192] - - [74, 0.0] + - [244, 0.0] - - [8192, 2, 1, 1024] - - [75, 0.0] + - [284, 0.0] - - [8192, 4, 1, 1024] - - [76, 0.0] + - [138, 0.0] - - [8192, 8, 1, 1024] - - [77, 0.0] + - [284, 0.0] - - [8192, 16, 1, 1024] - - [78, 0.0] + - [283, 0.0] - - [8192, 32, 1, 1024] - - [79, 0.0] + - [282, 0.0] - - [8192, 64, 1, 1024] - - [80, 0.0] + - [63, 0.0] - - [8192, 96, 1, 1024] - - [81, 0.0] + - [277, 0.0] - - [7168, 32, 1, 8192] - - [82, 0.0] + - [281, 0.0] - - [8192, 2, 1, 3584] - - [83, 0.0] + - [43, 0.0] - - [8192, 4, 1, 3584] - - [84, 0.0] + - [43, 0.0] - - [8192, 8, 1, 3584] - - [85, 0.0] + - [279, 0.0] - - [8192, 16, 1, 3584] - - [86, 0.0] + - [279, 0.0] - - [8192, 32, 1, 3584] - - [87, 0.0] + - [15, 0.0] - - [8192, 96, 1, 3584] - - [88, 0.0] + - [261, 0.0] - - [1280, 4, 1, 8192] - - [89, 0.0] + - [280, 0.0] - - [1280, 8, 1, 8192] - - [89, 0.0] + - [280, 0.0] - - [7168, 2, 1, 8192] - - [90, 0.0] + - [278, 0.0] - - [7168, 4, 1, 8192] - - [91, 0.0] + - [279, 0.0] - - [7168, 8, 1, 8192] - - [91, 0.0] + - [278, 0.0] - - [7168, 16, 1, 8192] - - [91, 0.0] + - [278, 0.0] - - [7168, 64, 1, 8192] - - [92, 0.0] + - [143, 0.0] - - [7168, 96, 1, 8192] - - [93, 0.0] + - [277, 0.0] - - [8192, 64, 1, 3584] - - [94, 0.0] + - [13, 0.0] - - [61440, 16, 1, 256] - - [117, 0.0] + - [262, 0.0] - - [256, 27, 1, 256] - - [96, 0.0] + - [60, 0.0] - - [256, 27, 1, 2560] - - [97, 0.0] + - [276, 0.0] - - [512, 27, 1, 256] - - [98, 0.0] + - [274, 0.0] - - [61440, 27, 1, 256] - - [99, 0.0] + - [275, 0.0] - - [262144, 27, 1, 256] - - [100, 0.0] + - [275, 0.0] - - [512, 33, 1, 256] - - [101, 0.0] + - [274, 0.0] - - [8, 4000, 1, 2048] - - [102, 0.0] + - [43, 0.0] - - [512, 4000, 1, 384] - - [103, 0.0] + - [98, 0.0] - - [6, 5940, 1, 512] - - [104, 0.0] + - [67, 0.0] - - [8, 5940, 1, 2048] - - [105, 0.0] + - [273, 0.0] - - [11, 5940, 1, 512] - - [104, 0.0] + - [272, 0.0] - - [15, 5940, 1, 32] - - [106, 0.0] + - [175, 0.0] - - [32, 5940, 1, 64] - - [107, 0.0] + - [271, 0.0] - - [35, 5940, 1, 128] - - [108, 0.0] + - [270, 0.0] - - [64, 5940, 1, 128] - - [109, 0.0] + - [269, 0.0] - - [128, 5940, 1, 128] - - [110, 0.0] + - [67, 0.0] - - [128, 5940, 1, 548] - - [111, 0.0] + - [49, 0.0] - - [384, 5940, 1, 513] - - [112, 0.0] + - [268, 0.0] - - [512, 5940, 1, 384] - - [115, 0.0] + - [238, 0.0] - - [512, 5940, 1, 512] - - [113, 0.0] + - [94, 0.0] - - [8192, 5940, 1, 2048] - - [118, 0.0] + - [202, 0.0] - - [512, 68850, 1, 516] - - [116, 0.0] + - [267, 0.0] - - [208448, 1792, 1, 1024] - - [119, 0.0] + - [210, 0.0] - - [20352, 1792, 1, 1024] - - [120, 0.0] + - [266, 0.0] - - [200, 128, 1792, 384] - - [121, 0.0] + - [207, 0.0] - - [208448, 1792, 1, 256] - - [122, 0.0] + - [265, 0.0] - - [2048, 1792, 1, 208448] - - [124, 0.0] + - [264, 0.0] - - [256, 1792, 1, 49152] - - [123, 0.0] + - [263, 0.0] - - [2304, 2, 1, 16384] - - [125, 0.0] + - [109, 0.0] - - [13312, 2, 1, 16384] - - [126, 0.0] + - [187, 0.0] - - [16384, 2, 1, 2048] - - [127, 0.0] + - [262, 0.0] - - [16384, 2, 1, 6656] - - [131, 0.0] + - [261, 0.0] - - [128256, 2, 1, 16384] - - [128, 0.0] + - [187, 0.0] - - [13312, 8192, 1, 16384] - - [130, 0.0] + - [154, 0.0] - - [16384, 8192, 1, 2048] - - [129, 0.0] + - [183, 0.0] - - [2304, 8192, 1, 16384] - - [130, 0.0] + - [260, 0.0] - - [16384, 8192, 1, 6656] - - [130, 0.0] + - [19, 0.0] - - [128256, 8192, 1, 16384] - - [130, 0.0] + - [190, 0.0] - - [2048, 1024, 1, 208448] - - [132, 0.0] + - [166, 0.0] - - [384, 204800, 1, 384] - - [133, 0.0] + - [208, 0.0] - - [26496, 1024, 1, 1024] - - [134, 0.0] + - [210, 0.0] - - [2048, 1024, 1, 8224] - - [135, 0.0] + - [259, 0.0] - - [20352, 1024, 1, 1024] - - [136, 0.0] + - [147, 0.0] - - [24448, 1024, 1, 1024] - - [137, 0.0] + - [120, 0.0] - - [208448, 1024, 1, 1024] - - [138, 0.0] + - [210, 0.0] - - [1024, 1024, 1, 24448] - - [139, 0.0] + - [258, 0.0] - - [1024, 1024, 1, 26496] - - [140, 0.0] + - [257, 0.0] - - [1024, 1024, 1, 22400] - - [141, 0.0] + - [21, 0.0] - - [1024, 1024, 1, 20352] - - [142, 0.0] + - [21, 0.0] - - [1024, 1024, 1, 18304] - - [143, 0.0] + - [21, 0.0] - - [1024, 1024, 1, 16256] - - [144, 0.0] + - [256, 0.0] - - [16032, 8192, 1, 16384] - - [130, 0.0] + - [161, 0.0] - - [2048, 1152, 1, 2048] - - [229, 365271.0] + - [255, 0.0] - - [2048, 462, 1, 4096] - - [145, 238481.0] + - [254, 0.0] - - [16384, 1152, 1, 2048] - - [223, 482015.0] + - [253, 0.0] - - [2048, 1152, 1, 8192] - - [224, 421424.0] + - [252, 0.0] - - [8192, 4608, 1, 1024] - - [161, 502189.0] + - [214, 0.0] - - [1024, 4608, 1, 1024] - - [226, 382191.0] + - [235, 0.0] - - [1024, 462, 1, 4096] - - [184, 176348.0] + - [241, 0.0] - - [1024, 4608, 1, 4096] - - [225, 404145.0] + - [235, 0.0] - - [2048, 462, 1, 768] - - [146, 153261.0] + - [251, 0.0] - - [4096, 18432, 1, 512] - - [162, 447793.0] + - [250, 0.0] - - [512, 18432, 1, 512] - - [227, 350140.0] + - [249, 0.0] - - [2048, 308, 1, 1472] - - [147, 125200.0] + - [248, 0.0] - - [16384, 576, 1, 2048] - - [228, 400355.0] + - [247, 0.0] - - [512, 18432, 1, 2048] - - [163, 427482.0] + - [246, 0.0] - - [2048, 576, 1, 2048] - - [148, 240993.0] + - [245, 0.0] - - [512, 462, 1, 4096] - - [149, 118621.0] + - [244, 0.0] - - [1024, 308, 1, 1472] - - [165, 80012.0] + - [243, 0.0] - - [512, 308, 1, 1472] - - [165, 46073.5] + - [55, 0.0] - - [16384, 231, 1, 4096] - - [150, 296176.0] + - [5, 0.0] - - [4096, 231, 1, 4096] - - [166, 227144.0] + - [242, 0.0] - - [8192, 2304, 1, 1024] - - [164, 487939.0] + - [214, 0.0] - - [2048, 231, 1, 4096] - - [172, 172069.0] + - [241, 0.0] - - [1024, 231, 1, 4096] - - [173, 131593.0] + - [224, 0.0] - - [1024, 2304, 1, 1024] - - [151, 281901.0] + - [240, 0.0] - - [4096, 9216, 1, 512] - - [162, 434990.0] + - [239, 0.0] - - [1024, 2304, 1, 4096] - - [152, 358643.0] + - [30, 0.0] - - [512, 9216, 1, 512] - - [176, 263290.0] + - [238, 0.0] - - [16384, 288, 1, 2048] - - [174, 322408.0] + - [237, 0.0] - - [1024, 154, 1, 1472] - - [167, 46054.0] + - [236, 0.0] - - [2048, 576, 1, 8192] - - [153, 322361.0] + - [0, 0.0] - - [512, 462, 1, 768] - - [175, 65492.7] + - [55, 0.0] - - [512, 9216, 1, 2048] - - [154, 346597.0] + - [235, 0.0] - - [1024, 462, 1, 768] - - [171, 108200.0] + - [234, 0.0] - - [2048, 231, 1, 768] - - [177, 103786.0] + - [233, 0.0] - - [2048, 154, 1, 1472] - - [168, 78881.1] + - [234, 0.0] - - [2048, 288, 1, 2048] - - [178, 163245.0] + - [233, 0.0] - - [512, 154, 1, 1472] - - [169, 25990.9] + - [232, 0.0] - - [3584, 154, 1, 1472] - - [155, 112584.0] + - [231, 0.0] - - [16384, 144, 1, 2048] - - [156, 226469.0] + - [230, 0.0] - - [1472, 154, 1, 3584] - - [157, 96964.5] + - [229, 0.0] - - [384, 154, 1, 1472] - - [169, 20044.6] + - [225, 0.0] - - [512, 231, 1, 768] - - [179, 36458.9] + - [228, 0.0] - - [3072, 77, 1, 768] - - [158, 58715.4] + - [223, 0.0] - - [1472, 154, 1, 384] - - [170, 31973.7] + - [54, 0.0] - - [768, 77, 1, 3072] - - [180, 43451.9] + - [227, 0.0] - - [512, 231, 1, 4096] - - [181, 82791.5] + - [226, 0.0] - - [768, 77, 1, 768] - - [180, 19256.8] + - [225, 0.0] - - [1024, 231, 1, 768] - - [159, 62472.2] + - [224, 0.0] - - [2048, 144, 1, 2048] - - [182, 109043.0] + - [223, 0.0] - - [2048, 144, 1, 8192] - - [183, 160947.0] + - [222, 0.0] - - [2048, 288, 1, 8192] - - [160, 236383.0] + - [81, 0.0] - - [128, 409600, 1, 384] - - [185, 0.0] + - [208, 0.0] - - [256, 2048, 1, 49152] - - [203, 0.0] + - [221, 0.0] - - [384, 393216, 1, 257] - - [186, 0.0] + - [220, 0.0] - - [384, 409600, 1, 128] - - [187, 0.0] + - [220, 0.0] - - [384, 409600, 1, 384] - - [188, 0.0] + - [219, 0.0] - - [641, 393216, 1, 6514] - - [189, 0.0] + - [218, 0.0] - - [1024, 2048, 1, 10112] - - [190, 0.0] + - [11, 0.0] - - [1024, 2048, 1, 12160] - - [191, 0.0] + - [11, 0.0] - - [1024, 2048, 1, 14208] - - [192, 0.0] + - [216, 0.0] - - [1024, 2048, 1, 16256] - - [214, 0.0] + - [2, 0.0] - - [1024, 2048, 1, 18304] - - [193, 0.0] + - [217, 0.0] - - [1024, 2048, 1, 20352] - - [204, 0.0] + - [217, 0.0] - - [1024, 2048, 1, 22400] - - [205, 0.0] + - [217, 0.0] - - [1024, 2048, 1, 24448] - - [206, 0.0] + - [217, 0.0] - - [1024, 2048, 1, 26496] - - [194, 0.0] + - [216, 0.0] - - [2048, 2048, 1, 8224] - - [207, 0.0] + - [33, 0.0] - - [2048, 2048, 1, 208448] - - [195, 0.0] + - [166, 0.0] - - [2304, 2048, 1, 4800] - - [208, 0.0] + - [8, 0.0] - - [4800, 2048, 1, 2304] - - [209, 0.0] + - [215, 0.0] - - [4800, 2048, 1, 22400] - - [196, 0.0] + - [74, 0.0] - - [14208, 2048, 1, 1024] - - [210, 0.0] + - [211, 0.0] - - [16256, 2048, 1, 1024] - - [216, 0.0] + - [154, 0.0] - - [18304, 2048, 1, 1024] - - [215, 0.0] + - [214, 0.0] - - [20352, 2048, 1, 1024] - - [211, 0.0] + - [100, 0.0] - - [22400, 2048, 1, 1024] - - [197, 0.0] + - [214, 0.0] - - [24448, 2048, 1, 1024] - - [198, 0.0] + - [213, 0.0] - - [24576, 2048, 1, 4800] - [212, 0.0] - - [26496, 2048, 1, 1024] - - [213, 0.0] + - [211, 0.0] - - [208448, 2048, 1, 1024] - - [130, 0.0] + - [210, 0.0] - - [256, 401179, 1, 512] - - [199, 0.0] + - [209, 0.0] - - [384, 331051, 1, 256] - - [200, 0.0] + - [208, 0.0] - - [384, 365802, 1, 256] - - [200, 0.0] + - [208, 0.0] - - [384, 395225, 1, 256] - - [201, 0.0] + - [208, 0.0] - - [384, 401179, 1, 256] - - [202, 0.0] + - [208, 0.0] - - [512, 331051, 1, 1536] - - [217, 0.0] + - [9, 0.0] - - [512, 395225, 1, 1024] - - [218, 0.0] + - [9, 0.0] - - [200, 128, 3072, 384] - - [219, 0.0] + - [207, 0.0] - - [200, 128, 4096, 384] - - [219, 0.0] + - [56, 0.0] - - [200, 128, 2048, 384] - - [219, 0.0] + - [207, 0.0] - - [200, 128, 1024, 384] - - [219, 0.0] + - [207, 0.0] - - [200, 128, 512, 384] - - [219, 0.0] + - [50, 0.0] - - [200, 128, 256, 384] - - [219, 0.0] + - [207, 0.0] - - [200, 128, 128, 384] - - [219, 0.0] + - [206, 0.0] - - [192, 257, 2048, 32] - - [220, 0.0] + - [205, 0.0] - - [192, 6514, 2048, 32] - - [221, 0.0] + - [205, 0.0] - - [6514, 192, 2048, 32] - - [222, 0.0] + - [173, 0.0] - - [1600, 43008, 1, 1600] - - [230, 0.0] + - [204, 0.0] - - [4096, 24576, 1, 4096] - - [231, 0.0] + - [167, 0.0] - - [50304, 51200, 1, 1600] - - [232, 0.0] + - [166, 0.0] - - [1600, 43008, 1, 6400] - - [271, 0.0] + - [201, 0.0] - - [6144, 24576, 1, 4096] - - [233, 0.0] + - [195, 0.0] - - [6400, 43008, 1, 1600] - - [234, 0.0] + - [185, 0.0] - - [4096, 24576, 1, 14336] - - [235, 0.0] + - [112, 0.0] - - [4800, 43008, 1, 1600] - - [236, 0.0] + - [203, 0.0] - - [14336, 24576, 1, 4096] - - [237, 0.0] + - [167, 0.0] - - [32000, 24576, 1, 4096] - - [238, 0.0] + - [141, 0.0] - - [50304, 43008, 1, 1600] - - [239, 0.0] + - [166, 0.0] - - [128256, 57344, 1, 8192] - - [296, 0.0] + - [190, 0.0] - - [128256, 61440, 1, 8192] - - [240, 302.55] + - [141, 0.0] - - [28672, 57344, 1, 8192] - - [262, 0.0] + - [161, 0.0] - - [28672, 61440, 1, 8192] - - [241, 297.67] + - [194, 0.0] - - [128256, 20480, 1, 4096] - - [258, 0.0] + - [161, 0.0] - - [14336, 20480, 1, 4096] - - [267, 0.0] + - [167, 0.0] - - [4096, 20480, 1, 4096] - - [268, 0.0] + - [112, 0.0] - - [14336, 16384, 1, 4096] - - [242, 0.0] + - [161, 0.0] - - [6144, 8192, 1, 4096] - - [243, 0.0] + - [183, 0.0] - - [32000, 8192, 1, 4096] - - [244, 0.0] + - [18, 0.0] - - [4096, 32768, 1, 14336] - - [274, 368.59] + - [167, 0.0] - - [6144, 16384, 1, 4096] - - [245, 0.0] + - [167, 0.0] - - [14336, 8192, 1, 4096] - - [257, 0.0] + - [167, 0.0] - - [32000, 16384, 1, 4096] - - [246, 0.0] + - [18, 0.0] - - [4096, 8192, 1, 4096] - - [247, 0.0] + - [183, 0.0] - - [4096, 8192, 1, 14336] - - [248, 0.0] + - [202, 0.0] - - [4096, 16384, 1, 4096] - - [249, 0.0] + - [183, 0.0] - - [4096, 16384, 1, 14336] - - [250, 0.0] + - [112, 0.0] - - [6144, 32768, 1, 4096] - - [279, 365.45] + - [195, 0.0] - - [14336, 32768, 1, 4096] - - [281, 376.58] + - [167, 0.0] - - [4096, 32768, 1, 4096] - - [277, 368.62] + - [167, 0.0] - - [1600, 8192, 1, 1600] - - [251, 0.0] + - [201, 0.0] - - [4800, 8192, 1, 1600] - - [252, 0.0] + - [27, 0.0] - - [6400, 8192, 1, 1600] - - [253, 0.0] + - [163, 0.0] - - [4096, 4096, 1, 14336] - - [254, 0.0] + - [183, 0.0] - - [8192, 12288, 1, 8192] - - [255, 0.0] + - [161, 0.0] - - [8192, 12288, 1, 28672] - - [256, 0.0] + - [18, 0.0] - - [10240, 12288, 1, 8192] - - [255, 0.0] + - [192, 0.0] - - [28672, 12288, 1, 8192] - - [258, 0.0] + - [161, 0.0] - - [50304, 8192, 1, 1600] - - [259, 0.0] + - [166, 0.0] - - [50304, 10240, 1, 1600] - - [260, 0.0] + - [166, 0.0] - - [128256, 4096, 1, 4096] - - [261, 0.0] + - [193, 0.0] - - [128256, 12288, 1, 8192] - - [262, 0.0] + - [161, 0.0] - - [6400, 6144, 1, 1600] - - [252, 0.0] + - [18, 0.0] - - [4800, 6144, 1, 1600] - - [263, 0.0] + - [200, 0.0] - - [50304, 6144, 1, 1600] - - [264, 0.0] + - [166, 0.0] - - [1600, 6144, 1, 1600] - - [265, 0.0] + - [199, 0.0] - - [1600, 6144, 1, 6400] - - [266, 0.0] + - [198, 0.0] - - [6144, 20480, 1, 4096] - - [267, 0.0] + - [195, 0.0] - - [4096, 20480, 1, 14336] - - [269, 0.0] + - [167, 0.0] - - [128256, 53248, 1, 8192] - - [261, 0.0] + - [190, 0.0] - - [28672, 53248, 1, 8192] - - [270, 0.0] + - [161, 0.0] - - [6400, 63488, 1, 1600] - - [272, 360.76] + - [185, 0.0] - - [1600, 63488, 1, 1600] - - [273, 324.89] + - [197, 0.0] - - [1600, 63488, 1, 6400] - - [274, 331.57] + - [197, 0.0] - - [4096, 28672, 1, 4096] - - [275, 355.96] + - [167, 0.0] - - [4096, 28672, 1, 14336] - - [276, 360.79] + - [112, 0.0] - - [4800, 63488, 1, 1600] - - [278, 339.58] + - [196, 0.0] - - [6144, 28672, 1, 4096] - - [275, 377.33] + - [195, 0.0] - - [8192, 69632, 1, 8192] - - [279, 390.72] + - [192, 0.0] - - [8192, 69632, 1, 28672] - - [280, 317.49] + - [18, 0.0] - - [10240, 69632, 1, 8192] - - [275, 397.78] + - [192, 0.0] - - [14336, 28672, 1, 4096] - - [278, 376.79] + - [167, 0.0] - - [28672, 69632, 1, 8192] - - [277, 333.44] + - [194, 0.0] - - [32000, 32768, 1, 4096] - - [279, 339.84] + - [141, 0.0] - - [50304, 63488, 1, 1600] - - [282, 354.53] + - [166, 0.0] - - [50304, 75776, 1, 1600] - - [283, 348.84] + - [166, 0.0] - - [128256, 28672, 1, 4096] - - [279, 322.79] + - [161, 0.0] - - [128256, 69632, 1, 8192] - - [284, 340.94] + - [190, 0.0] - - [1024, 40960, 1, 8192] - - [285, 0.0] + - [9, 0.0] - - [1024, 49152, 1, 8192] - - [286, 0.0] + - [189, 0.0] - - [28672, 40960, 1, 8192] - - [262, 0.0] + - [161, 0.0] - - [128256, 49152, 1, 8192] - - [287, 0.0] + - [190, 0.0] - - [8192, 40960, 1, 8192] - - [130, 0.0] + - [192, 0.0] - - [8192, 49152, 1, 28672] - - [288, 0.0] + - [18, 0.0] - - [1024, 57344, 1, 8192] - - [289, 0.0] + - [9, 0.0] - - [8192, 57344, 1, 28672] - - [130, 0.0] + - [193, 0.0] - - [28672, 49152, 1, 8192] - - [262, 0.0] + - [161, 0.0] - - [128256, 24576, 1, 8192] - - [290, 0.0] + - [190, 0.0] - - [8192, 57344, 1, 8192] - - [291, 0.0] + - [192, 0.0] - - [8192, 24576, 1, 28672] - - [288, 0.0] + - [18, 0.0] - - [28672, 24576, 1, 8192] - - [262, 0.0] + - [161, 0.0] - - [8192, 32768, 1, 8192] - - [292, 0.0] + - [161, 0.0] - - [128256, 32768, 1, 8192] - - [293, 0.0] + - [190, 0.0] - - [1024, 24576, 1, 8192] - - [294, 0.0] + - [191, 0.0] - - [8192, 24576, 1, 8192] - - [295, 0.0] + - [167, 0.0] - - [28672, 32768, 1, 8192] - - [262, 0.0] + - [161, 0.0] - - [8192, 49152, 1, 8192] - - [297, 0.0] + - [167, 0.0] - - [128256, 40960, 1, 8192] - - [298, 0.0] + - [190, 0.0] - - [8192, 32768, 1, 28672] - - [288, 0.0] + - [18, 0.0] - - [1024, 32768, 1, 8192] - - [299, 0.0] + - [189, 0.0] - - [8192, 40960, 1, 28672] - - [130, 0.0] + - [18, 0.0] - - [6144, 792, 1, 19648] - - [300, 0.0] + - [188, 0.0] - - [7680, 8, 1, 6144] - - [301, 0.0] + - [187, 0.0] - - [10576, 12288, 1, 2560] - - [302, 299.53] + - [161, 0.0] - - [50288, 12288, 1, 2560] - - [244, 319.09] + - [141, 0.0] - - [222, 5905, 1, 1024] - - [303, 0.0] + - [105, 0.0] - - [256, 5905, 1, 128] - - [304, 0.0] + - [91, 0.0] - - [1024, 5905, 1, 6144] - - [305, 0.0] + - [186, 0.0] - - [4096, 5905, 1, 6144] - - [306, 0.0] + - [154, 0.0] - - [1024, 5905, 1, 11715] - - [307, 0.0] + - [186, 0.0] - - [2048, 5905, 1, 5120] - - [308, 0.0] + - [154, 0.0] - - [11715, 5905, 1, 1024] - - [309, 0.0] + - [167, 0.0] - - [512, 5905, 1, 512] - - [310, 0.0] + - [94, 0.0] - - [256, 5905, 1, 1024] - - [311, 0.0] + - [105, 0.0] - - [2048, 5905, 1, 1475] - - [312, 0.0] + - [167, 0.0] - - [2048, 5905, 1, 4440] - - [313, 0.0] + - [179, 0.0] - - [512, 5905, 1, 1475] - - [314, 0.0] + - [12, 0.0] - - [256, 795008, 1, 256] - - [315, 0.0] + - [185, 0.0] - - [1024, 5905, 1, 5120] - - [316, 0.0] + - [184, 0.0] - - [24, 1511680, 1, 224] - - [317, 0.0] + - [45, 0.0] - - [5120, 5905, 1, 4096] - - [318, 0.0] + - [183, 0.0] - - [1475, 5905, 1, 1024] - - [319, 0.0] + - [182, 0.0] - - [512, 5905, 1, 256] - - [320, 0.0] + - [94, 0.0] - - [1024, 5905, 1, 1475] - - [321, 0.0] + - [181, 0.0] - - [11144, 5905, 1, 128] - - [322, 0.0] + - [180, 0.0] - - [2048, 5905, 1, 4040] - - [323, 0.0] + - [179, 0.0] - - [256, 5905, 1, 64] - - [324, 0.0] + - [91, 0.0] - - [202, 5905, 1, 1024] - - [325, 0.0] + - [178, 0.0] - - [128, 5905, 1, 1475] - - [326, 0.0] + - [177, 0.0] - - [256, 5905, 1, 512] - - [327, 0.0] + - [176, 0.0] - - [8, 5905, 1, 512] - - [328, 0.0] + - [175, 0.0] - - [256, 795008, 1, 512] - - [329, 0.0] + - [174, 0.0] - - [1475, 5905, 1, 128] - - [330, 0.0] + - [46, 0.0] - - [256, 795008, 1, 192] - - [331, 0.0] + - [173, 0.0] - - [128, 5905, 1, 1024] - - [332, 0.0] + - [71, 0.0] - - [5120, 5905, 1, 11715] - - [333, 0.0] + - [172, 0.0] - - [6144, 5905, 1, 1024] - - [334, 0.0] + - [171, 0.0] - - [1024, 5905, 1, 512] - - [335, 0.0] + - [170, 0.0] - - [20, 1511680, 1, 202] - - [336, 0.0] + - [45, 0.0] - - [5120, 1, 1, 5120] - - [83, 0.0] + - [169, 0.0] - - [4096, 1024, 1, 10240] - - [337, 0.0] + - [74, 0.0] - - [16384, 15683, 1, 256] - - [338, 0.0] + - [168, 0.0] - - [5120, 21504, 1, 5120] - - [130, 0.0] + - [167, 0.0] - - [6144, 90112, 1, 6144] - - [130, 0.0] + - [161, 0.0] - - [5120, 21504, 1, 13824] - - [130, 0.0] + - [112, 0.0] - - [6144, 180224, 1, 16384] - - [130, 0.0] + - [166, 0.0] - - [6144, 90112, 1, 1024] - - [130, 0.0] + - [18, 0.0] - - [512, 81, 1, 512] - - [180, 0.0] + - [109, 0.0] - - [5120, 672, 1, 768] - - [339, 0.0] + - [114, 0.0] - - [6144, 90112, 1, 8] - - [340, 0.0] + - [165, 0.0] - - [1152, 65536, 1, 1152] - - [236, 0.0] + - [164, 0.0] - - [4304, 65536, 1, 1152] - - [341, 0.0] + - [163, 0.0] - - [3072, 17, 1, 768] - - [179, 0.0] + - [162, 0.0] - - [3072, 26112, 1, 15360] - - [294, 0.0] + - [161, 0.0] - - [3072, 17, 1, 256] - - [98, 0.0] + - [109, 0.0] - - [9216, 17, 1, 3072] - - [94, 0.0] + - [15, 0.0] - - [18432, 17, 1, 3072] - - [94, 0.0] + - [160, 0.0] - - [160128, 8, 1, 4096] - - [342, 0.0] + - [136, 0.0] - - [160, 409600, 1, 48] - - [343, 63144.0] + - [159, 0.0] - - [6400, 2048, 1, 512] - - [344, 367538.0] - - - [2048, 2048, 1, 512] - - [345, 290567.0] + - [112, 0.0] - - [160, 2048, 256, 160] - - [343, 166407.0] + - [159, 0.0] - - [160, 863, 2048, 40] - - [343, 54382.3] + - [140, 0.0] - - [113, 160, 2048, 40] - - [346, 27736.1] + - [62, 0.0] - - [160, 113, 2048, 40] - - [347, 35056.3] + - [158, 0.0] - - [6048, 2048, 1, 256] - - [348, 214542.0] + - [56, 0.0] - - [5120, 2048, 1, 2048] - - [349, 441963.0] + - [154, 0.0] - - [512, 2048, 1, 512] - - [350, 137311.0] + - [49, 0.0] - - [512, 2048, 1, 2560] - - [351, 222160.0] + - [121, 0.0] - - [512, 2048, 1, 2048] - - [351, 206466.0] + - [121, 0.0] - - [4520, 2048, 1, 128] - - [352, 108466.0] + - [56, 0.0] - - [2560, 2048, 1, 4520] - - [353, 437587.0] + - [132, 0.0] - - [2560, 2048, 1, 34520] - - [354, 481011.0] + - [157, 0.0] - - [256, 65536, 1, 160] - - [355, 228530.0] + - [156, 0.0] - - [256, 2048, 1, 6048] - - [356, 175096.0] + - [84, 0.0] - - [2048, 2048, 1, 6048] - - [357, 447625.0] + - [22, 0.0] - - [2048, 2048, 1, 512] - - [345, 286402.0] + - [119, 0.0] - - [16, 327680, 1, 113] - - [358, 12662.0] + - [104, 0.0] - - [128, 2048, 1, 512] - - [351, 37107.5] + - [54, 0.0] + - - [1024, 2048, 1, 5376] + - [0, 0.0] + - - [8192, 2048, 1, 4096] + - [1, 0.0] + - - [2048, 2048, 1, 6144] + - [2, 0.0] + - - [3072, 2048, 1, 768] + - [3, 0.0] + - - [3072, 2048, 1, 1024] + - [4, 0.0] + - - [2048, 2048, 1, 4271] + - [5, 0.0] + - - [768, 2048, 1, 5376] + - [6, 0.0] + - - [160, 2048, 1, 3072] + - [7, 0.0] + - - [2048, 2048, 1, 6176] + - [8, 0.0] + - - [256, 49152, 1, 64] + - [9, 0.0] + - - [4271, 2048, 1, 512] + - [10, 0.0] + - - [1536, 2048, 1, 6176] + - [11, 0.0] + - - [2992, 2048, 1, 512] + - [12, 0.0] + - - [192, 2048, 1, 2048] + - [13, 0.0] + - - [1536, 2048, 1, 3072] + - [14, 0.0] + - - [128, 2048, 1, 2049] + - [15, 0.0] + - - [2050, 2048, 1, 512] + - [16, 0.0] + - - [512, 2048, 1, 4271] + - [17, 0.0] + - - [12288, 2048, 1, 3072] + - [18, 0.0] + - - [5120, 2048, 1, 256] + - [19, 0.0] + - - [2048, 2048, 1, 1536] + - [16, 0.0] + - - [512, 2048, 1, 2304] + - [20, 0.0] + - - [1024, 2048, 1, 5888] + - [21, 0.0] + - - [2048, 2048, 1, 3970] + - [22, 0.0] + - - [2334, 2048, 1, 512] + - [8, 0.0] + - - [1024, 2048, 1, 2416] + - [23, 0.0] + - - [33, 2048, 1, 4517] + - [24, 0.0] + - - [3072, 2048, 1, 3970] + - [4, 0.0] + - - [64, 196608, 1, 1452] + - [25, 0.0] + - - [1536, 2048, 1, 4096] + - [26, 0.0] + - - [6144, 2048, 1, 768] + - [27, 0.0] + - - [768, 2048, 1, 3864] + - [28, 0.0] + - - [2048, 2048, 1, 50176] + - [29, 0.0] + - - [1024, 2048, 1, 4248] + - [30, 0.0] + - - [192, 59392, 1, 96] + - [31, 0.0] + - - [355, 327680, 1, 1221] + - [32, 0.0] + - - [2048, 2048, 1, 19536] + - [33, 0.0] + - - [4096, 2048, 1, 5376] + - [1, 0.0] + - - [512, 2048, 1, 5120] + - [34, 0.0] + - - [1536, 2048, 1, 24704] + - [35, 0.0] + - - [1536, 2048, 1, 2048] + - [36, 0.0] + - - [2048, 2048, 1, 19520] + - [33, 0.0] + - - [1024, 2048, 1, 3072] + - [37, 0.0] + - - [256, 2048, 1, 1280] + - [38, 0.0] + - - [1536, 2048, 1, 3360] + - [39, 0.0] + - - [12288, 2048, 1, 4096] + - [18, 0.0] + - - [2074, 2048, 1, 512] + - [16, 0.0] + - - [3072, 2048, 1, 5376] + - [40, 0.0] + - - [160, 2048, 1, 32] + - [41, 0.0] + - - [160, 2048, 1, 128] + - [42, 0.0] + - - [160, 2048, 1, 2048] + - [13, 0.0] + - - [20, 2048, 1, 1024] + - [43, 0.0] + - - [128, 2048, 1, 2048] + - [7, 0.0] + - - [192, 2048, 1, 120] + - [44, 0.0] + - - [1024, 2048, 1, 4096] + - [37, 0.0] + - - [32, 327680, 1, 36] + - [45, 0.0] + - - [256, 2048, 1, 308] + - [46, 0.0] + - - [4096, 2048, 1, 2048] + - [47, 0.0] + - - [192, 2048, 1, 72] + - [48, 0.0] + - - [512, 2048, 1, 96] + - [49, 0.0] + - - [1723, 2048, 1, 64] + - [46, 0.0] + - - [96, 327680, 1, 99] + - [50, 0.0] + - - [1440, 2048, 1, 614] + - [51, 0.0] + - - [3527, 2048, 1, 512] + - [52, 0.0] + - - [32, 262144, 1, 94] + - [45, 0.0] + - - [256, 2048, 1, 1723] + - [53, 0.0] + - - [160, 2048, 1, 144] + - [41, 0.0] + - - [128, 2048, 1, 96] + - [54, 0.0] + - - [106, 2048, 1, 8648] + - [13, 0.0] + - - [128, 2048, 1, 3072] + - [55, 0.0] + - - [4200, 2048, 1, 256] + - [56, 0.0] + - - [2048, 2048, 1, 2240] + - [5, 0.0] + - - [96, 2048, 1, 512] + - [57, 0.0] + - - [384, 2048, 1, 384] + - [58, 0.0] + - - [768, 2048, 1, 256] + - [59, 0.0] + - - [73, 196608, 1, 103] + - [60, 0.0] + - - [512, 2048, 1, 1770] + - [20, 0.0] + - - [96, 2048, 1, 120] + - [61, 0.0] + - - [128, 2048, 1, 64] + - [54, 0.0] + - - [117, 196608, 1, 89] + - [62, 0.0] + - - [320, 2048, 1, 492] + - [48, 0.0] + - - [160, 2048, 1, 512] + - [63, 0.0] + - - [3200, 2048, 1, 4992] + - [4, 0.0] + - - [128, 2048, 1, 240] + - [54, 0.0] + - - [160, 2048, 1, 2704] + - [15, 0.0] + - - [1024, 2048, 1, 2048] + - [64, 0.0] + - - [117, 196608, 1, 81] + - [62, 0.0] + - - [320, 2048, 1, 64] + - [65, 0.0] + - - [256, 2048, 1, 780] + - [38, 0.0] + - - [1536, 2048, 1, 768] + - [66, 0.0] + - - [64, 2048, 1, 1536] + - [15, 0.0] + - - [96, 2048, 1, 32] + - [57, 0.0] + - - [256, 2048, 1, 2944] + - [67, 0.0] + - - [256, 2048, 1, 256] + - [48, 0.0] + - - [2048, 2048, 1, 3008] + - [16, 0.0] + - - [256, 2048, 1, 2170] + - [68, 0.0] + - - [512, 2048, 1, 192] + - [49, 0.0] + - - [256, 2048, 1, 19604] + - [69, 0.0] + - - [512, 2048, 1, 320] + - [49, 0.0] + - - [4324, 2048, 1, 256] + - [56, 0.0] + - - [5248, 2048, 1, 1024] + - [19, 0.0] + - - [118, 327680, 1, 466] + - [70, 0.0] + - - [384, 2048, 1, 3072] + - [71, 0.0] + - - [2592, 2048, 1, 256] + - [72, 0.0] + - - [180, 2048, 1, 64] + - [48, 0.0] + - - [128, 2048, 1, 144] + - [54, 0.0] + - - [2048, 2048, 1, 1024] + - [73, 0.0] + - - [160, 2048, 1, 780] + - [7, 0.0] + - - [4384, 2048, 1, 768] + - [74, 0.0] + - - [256, 2048, 1, 24] + - [48, 0.0] + - - [256, 2048, 1, 4382] + - [13, 0.0] + - - [308, 2048, 1, 256] + - [48, 0.0] + - - [1024, 2048, 1, 1024] + - [75, 0.0] + - - [7393, 2048, 1, 64] + - [76, 0.0] + - - [270, 2048, 1, 128] + - [54, 0.0] + - - [512, 2048, 1, 129] + - [49, 0.0] + - - [32, 2048, 1, 2049] + - [77, 0.0] + - - [320, 2048, 1, 308] + - [78, 0.0] + - - [160, 2048, 1, 60] + - [41, 0.0] + - - [768, 2048, 1, 6552] + - [28, 0.0] + - - [128, 2048, 1, 492] + - [67, 0.0] + - - [320, 2048, 1, 24] + - [48, 0.0] + - - [2304, 2048, 1, 512] + - [16, 0.0] + - - [16, 327680, 1, 70] + - [45, 0.0] + - - [160, 2048, 1, 120] + - [41, 0.0] + - - [2048, 2048, 1, 3656] + - [22, 0.0] + - - [144, 2048, 1, 64] + - [79, 0.0] + - - [1024, 2048, 1, 132] + - [80, 0.0] + - - [128, 2048, 1, 160] + - [54, 0.0] + - - [16, 196608, 1, 33] + - [45, 0.0] + - - [768, 2048, 1, 2048] + - [81, 0.0] + - - [1944, 2048, 1, 512] + - [46, 0.0] + - - [614, 2048, 1, 128] + - [82, 0.0] + - - [256, 2048, 1, 4580] + - [13, 0.0] + - - [256, 2048, 1, 2452] + - [68, 0.0] + - - [128, 2048, 1, 32] + - [54, 0.0] + - - [4382, 2048, 1, 128] + - [56, 0.0] + - - [192, 2048, 1, 24] + - [83, 0.0] + - - [150, 2048, 1, 8648] + - [84, 0.0] + - - [1536, 2048, 1, 2592] + - [22, 0.0] + - - [4096, 2048, 1, 512] + - [10, 0.0] + - - [48, 49152, 1, 48] + - [85, 0.0] + - - [2704, 2048, 1, 160] + - [12, 0.0] + - - [256, 2048, 1, 768] + - [38, 0.0] + - - [64, 409600, 1, 64] + - [46, 0.0] + - - [128, 2048, 1, 256] + - [86, 0.0] + - - [64, 2048, 1, 1920] + - [43, 0.0] + - - [24, 81920, 1, 25] + - [45, 0.0] + - - [160, 2048, 1, 96] + - [41, 0.0] + - - [2048, 2048, 1, 768] + - [16, 0.0] + - - [96, 2048, 1, 144] + - [57, 0.0] + - - [128, 65536, 1, 64] + - [46, 0.0] + - - [10, 196608, 1, 33] + - [45, 0.0] + - - [512, 2048, 1, 120] + - [49, 0.0] + - - [320, 2048, 1, 252] + - [48, 0.0] + - - [2040, 2048, 1, 256] + - [16, 0.0] + - - [4864, 2048, 1, 896] + - [87, 0.0] + - - [256, 2048, 1, 72] + - [48, 0.0] + - - [64, 2048, 1, 7393] + - [15, 0.0] + - - [1440, 2048, 1, 768] + - [88, 0.0] + - - [256, 2048, 1, 2048] + - [68, 0.0] + - - [256, 2048, 1, 340] + - [46, 0.0] + - - [128, 2048, 1, 128] + - [50, 0.0] + - - [2048, 2048, 1, 81536] + - [74, 0.0] + - - [7808, 2048, 1, 512] + - [89, 0.0] + - - [4382, 2048, 1, 256] + - [56, 0.0] + - - [192, 2048, 1, 240] + - [90, 0.0] + - - [96, 2048, 1, 160] + - [57, 0.0] + - - [256, 2048, 1, 128] + - [48, 0.0] + - - [640, 2048, 1, 288] + - [91, 0.0] + - - [160, 2048, 1, 72] + - [62, 0.0] + - - [160, 2048, 1, 192] + - [41, 0.0] + - - [24, 262144, 1, 73] + - [45, 0.0] + - - [512, 2048, 1, 10240] + - [92, 0.0] + - - [2048, 2048, 1, 3527] + - [5, 0.0] + - - [128, 2048, 1, 1810] + - [15, 0.0] + - - [2592, 2048, 1, 384] + - [72, 0.0] + - - [96, 2048, 1, 64] + - [57, 0.0] + - - [320, 2048, 1, 612] + - [93, 0.0] + - - [1536, 2048, 1, 3200] + - [39, 0.0] + - - [128, 2048, 1, 612] + - [54, 0.0] + - - [64, 65536, 1, 128] + - [78, 0.0] + - - [1536, 2048, 1, 1024] + - [94, 0.0] + - - [2048, 2048, 1, 6336] + - [8, 0.0] + - - [512, 2048, 1, 1959] + - [20, 0.0] + - - [2048, 2048, 1, 19936] + - [33, 0.0] + - - [2048, 2048, 1, 7808] + - [2, 0.0] + - - [15, 196608, 1, 27] + - [45, 0.0] + - - [256, 2048, 1, 3824] + - [13, 0.0] + - - [96, 2048, 1, 2048] + - [95, 0.0] + - - [320, 2048, 1, 256] + - [48, 0.0] + - - [80, 24576, 1, 40] + - [96, 0.0] + - - [3840, 2048, 1, 2048] + - [97, 0.0] + - - [960, 2048, 1, 128] + - [98, 0.0] + - - [19604, 2048, 1, 256] + - [99, 0.0] + - - [6336, 2048, 1, 512] + - [100, 0.0] + - - [156, 2048, 1, 840] + - [7, 0.0] + - - [256, 2048, 1, 1740] + - [68, 0.0] + - - [512, 2048, 1, 160] + - [49, 0.0] + - - [1152, 2048, 1, 128] + - [101, 0.0] + - - [180, 2048, 1, 72] + - [48, 0.0] + - - [528, 2048, 1, 256] + - [102, 0.0] + - - [64, 2048, 1, 64] + - [103, 0.0] + - - [64, 2048, 1, 1783] + - [43, 0.0] + - - [128, 2048, 1, 552] + - [67, 0.0] + - - [256, 2048, 1, 1536] + - [68, 0.0] + - - [10, 196608, 1, 142] + - [104, 0.0] + - - [512, 2048, 1, 1024] + - [105, 0.0] + - - [100, 2048, 1, 200] + - [90, 0.0] + - - [512, 2048, 1, 3104] + - [34, 0.0] + - - [144, 196608, 1, 623] + - [106, 0.0] + - - [192, 2048, 1, 528] + - [48, 0.0] + - - [68, 327680, 1, 488] + - [107, 0.0] + - - [1024, 2048, 1, 3824] + - [21, 0.0] + - - [256, 2048, 1, 136] + - [48, 0.0] + - - [512, 2048, 1, 3527] + - [108, 0.0] + - - [24, 2048, 1, 256] + - [109, 0.0] + - - [256, 2048, 1, 64] + - [48, 0.0] + - - [160, 2048, 1, 240] + - [41, 0.0] + - - [32, 196608, 1, 81] + - [45, 0.0] + - - [19, 196608, 1, 407] + - [45, 0.0] + - - [2944, 2048, 1, 256] + - [110, 0.0] + - - [768, 2048, 1, 4384] + - [81, 0.0] + - - [4096, 2048, 1, 1024] + - [111, 0.0] + - - [6400, 2048, 1, 256] + - [112, 0.0] + - - [132, 2048, 1, 256] + - [113, 0.0] + - - [180, 2048, 1, 128] + - [48, 0.0] + - - [256, 2048, 1, 4324] + - [13, 0.0] + - - [128, 2048, 1, 3200] + - [55, 0.0] + - - [96, 2048, 1, 72] + - [70, 0.0] + - - [2048, 2048, 1, 1770] + - [16, 0.0] + - - [960, 2048, 1, 256] + - [86, 0.0] + - - [1536, 2048, 1, 512] + - [114, 0.0] + - - [128, 2048, 1, 72] + - [50, 0.0] + - - [5120, 2048, 1, 512] + - [115, 0.0] + - - [512, 2048, 1, 64] + - [49, 0.0] + - - [640, 2048, 1, 512] + - [91, 0.0] + - - [4096, 2048, 1, 9728] + - [116, 0.0] + - - [768, 2048, 1, 5120] + - [81, 0.0] + - - [128, 2048, 1, 1536] + - [7, 0.0] + - - [195, 327680, 1, 1274] + - [99, 0.0] + - - [320, 2048, 1, 780] + - [117, 0.0] + - - [312, 2048, 1, 136] + - [48, 0.0] + - - [512, 2048, 1, 256] + - [49, 0.0] + - - [168, 262144, 1, 699] + - [118, 0.0] + - - [256, 2048, 1, 840] + - [38, 0.0] + - - [2048, 2048, 1, 256] + - [119, 0.0] + - - [512, 2048, 1, 32] + - [49, 0.0] + - - [512, 2048, 1, 1783] + - [20, 0.0] + - - [128, 2048, 1, 960] + - [57, 0.0] + - - [3072, 2048, 1, 512] + - [120, 0.0] + - - [96, 2048, 1, 1024] + - [71, 0.0] + - - [160, 2048, 1, 64] + - [41, 0.0] + - - [512, 2048, 1, 2944] + - [121, 0.0] + - - [128, 2048, 1, 5050] + - [55, 0.0] + - - [20, 163840, 1, 39] + - [45, 0.0] + - - [256, 2048, 1, 2304] + - [68, 0.0] + - - [1536, 2048, 1, 42400] + - [35, 0.0] + - - [1752, 2048, 1, 512] + - [78, 0.0] + - - [512, 2048, 1, 72] + - [41, 0.0] + - - [96, 2048, 1, 128] + - [57, 0.0] + - - [2048, 2048, 1, 5248] + - [22, 0.0] + - - [2048, 2048, 1, 1959] + - [5, 0.0] + - - [512, 2048, 1, 144] + - [49, 0.0] + - - [3200, 2048, 1, 1536] + - [120, 0.0] + - - [70, 262144, 1, 607] + - [107, 0.0] + - - [360, 2048, 1, 708] + - [117, 0.0] + - - [6552, 2048, 1, 768] + - [122, 0.0] + - - [3072, 2048, 1, 1536] + - [120, 0.0] + - - [128, 2048, 1, 120] + - [54, 0.0] + - - [512, 2048, 1, 240] + - [49, 0.0] + - - [128, 2048, 1, 192] + - [54, 0.0] + - - [3072, 2048, 1, 384] + - [123, 0.0] + - - [57, 327680, 1, 1060] + - [25, 0.0] + - - [768, 2048, 1, 1536] + - [124, 0.0] + - - [1024, 2048, 1, 512] + - [46, 0.0] + - - [81536, 2048, 1, 512] + - [125, 0.0] + - - [180, 2048, 1, 32] + - [83, 0.0] + - - [1056, 2048, 1, 512] + - [126, 0.0] + - - [128, 2048, 1, 28] + - [54, 0.0] + - - [64, 65536, 1, 64] + - [78, 0.0] + - - [1792, 2048, 1, 1944] + - [5, 0.0] + - - [1959, 2048, 1, 512] + - [78, 0.0] + - - [256, 2048, 1, 320] + - [127, 0.0] + - - [2560, 2048, 1, 512] + - [110, 0.0] + - - [1792, 2048, 1, 2848] + - [5, 0.0] + - - [19424, 2048, 1, 512] + - [128, 0.0] + - - [256, 2048, 1, 3072] + - [13, 0.0] + - - [2048, 2048, 1, 3104] + - [22, 0.0] + - - [128, 2048, 1, 1740] + - [7, 0.0] + - - [180, 2048, 1, 96] + - [48, 0.0] + - - [48, 204800, 1, 48] + - [129, 0.0] + - - [256, 2048, 1, 132] + - [130, 0.0] + - - [576, 2048, 1, 512] + - [58, 0.0] + - - [1280, 2048, 1, 960] + - [8, 0.0] + - - [312, 2048, 1, 816] + - [131, 0.0] + - - [128, 2048, 1, 80] + - [54, 0.0] + - - [2688, 2048, 1, 4096] + - [132, 0.0] + - - [512, 2048, 1, 2200] + - [20, 0.0] + - - [2048, 2048, 1, 6552] + - [2, 0.0] + - - [256, 2048, 1, 578] + - [133, 0.0] + - - [3072, 2048, 1, 256] + - [134, 0.0] + - - [192, 2048, 1, 480] + - [54, 0.0] + - - [3656, 2048, 1, 512] + - [52, 0.0] + - - [720, 2048, 1, 288] + - [135, 0.0] + - - [384, 2048, 1, 1959] + - [67, 0.0] + - - [32, 2048, 1, 960] + - [136, 0.0] + - - [96, 2048, 1, 96] + - [61, 0.0] + - - [1024, 2048, 1, 2704] + - [23, 0.0] + - - [256, 2048, 1, 432] + - [48, 0.0] + - - [3824, 2048, 1, 256] + - [137, 0.0] + - - [42400, 2048, 1, 256] + - [89, 0.0] + - - [512, 2048, 1, 1536] + - [121, 0.0] + - - [32, 327680, 1, 105] + - [45, 0.0] + - - [32, 2048, 1, 3200] + - [138, 0.0] + - - [512, 2048, 1, 3656] + - [17, 0.0] + - - [360, 2048, 1, 128] + - [54, 0.0] + - - [1536, 2048, 1, 4200] + - [39, 0.0] + - - [1536, 2048, 1, 19604] + - [74, 0.0] + - - [128, 2048, 1, 1152] + - [139, 0.0] + - - [4580, 2048, 1, 256] + - [56, 0.0] + - - [896, 2048, 1, 4864] + - [92, 0.0] + - - [2848, 2048, 1, 512] + - [110, 0.0] + - - [184, 262144, 1, 607] + - [140, 0.0] + - - [48, 49152, 1, 128] + - [85, 0.0] + - - [49, 196608, 1, 407] + - [60, 0.0] + - - [1536, 2048, 1, 4580] + - [39, 0.0] + - - [3104, 2048, 1, 512] + - [120, 0.0] + - - [2048, 2048, 1, 7652] + - [2, 0.0] + - - [1770, 2048, 1, 512] + - [78, 0.0] + - - [19936, 2048, 1, 512] + - [141, 0.0] + - - [32, 2048, 1, 2170] + - [77, 0.0] + - - [128, 2048, 1, 7084] + - [13, 0.0] + - - [128, 2048, 1, 708] + - [15, 0.0] + - - [10, 196608, 1, 407] + - [45, 0.0] + - - [512, 2048, 1, 4096] + - [17, 0.0] + - - [1024, 2048, 1, 5248] + - [142, 0.0] + - - [256, 2048, 1, 5888] + - [143, 0.0] + - - [2560, 2048, 1, 2048] + - [144, 0.0] + - - [97, 262144, 1, 73] + - [62, 0.0] + - - [360, 2048, 1, 80] + - [54, 0.0] + - - [2048, 2048, 1, 576] + - [119, 0.0] + - - [2240, 2048, 1, 512] + - [16, 0.0] + - - [288, 2048, 1, 1200] + - [67, 0.0] + - - [65, 196608, 1, 623] + - [107, 0.0] + - - [512, 2048, 1, 128] + - [145, 0.0] + - - [8, 2048, 1, 32] + - [146, 0.0] + - - [48, 409600, 1, 48] + - [129, 0.0] + - - [1783, 2048, 1, 64] + - [46, 0.0] + - - [128, 2048, 1, 1440] + - [15, 0.0] + - - [4096, 2048, 1, 7936] + - [116, 0.0] + - - [64, 327680, 1, 99] + - [46, 0.0] + - - [208, 327680, 1, 1060] + - [147, 0.0] + - - [512, 2048, 1, 768] + - [121, 0.0] + - - [10240, 2048, 1, 512] + - [148, 0.0] + - - [134, 262144, 1, 94] + - [140, 0.0] + - - [10, 196608, 1, 29] + - [45, 0.0] + - - [3072, 2048, 1, 6144] + - [149, 0.0] + - - [768, 2048, 1, 7393] + - [150, 0.0] + - - [121, 327680, 1, 105] + - [127, 0.0] + - - [256, 2048, 1, 2080] + - [68, 0.0] + - - [6105, 2048, 1, 256] + - [56, 0.0] + - - [128, 49152, 1, 48] + - [151, 0.0] + - - [48, 327680, 1, 70] + - [129, 0.0] + - - [2048, 2048, 1, 19424] + - [33, 0.0] + - - [1440, 2048, 1, 256] + - [152, 0.0] + - - [49, 262144, 1, 699] + - [147, 0.0] + - - [160, 2048, 1, 4580] + - [15, 0.0] + - - [512, 2048, 1, 3840] + - [153, 0.0] + - - [1024, 2048, 1, 256] + - [80, 0.0] + - - [16776, 2048, 1, 512] + - [128, 0.0] + - - [10240, 2048, 1, 3200] + - [154, 0.0] + - - [64, 2048, 1, 256] + - [103, 0.0] + - - [1440, 2048, 1, 128] + - [152, 0.0] + - - [2048, 2048, 1, 2048] + - [29, 0.0] + - - [256, 2048, 1, 2040] + - [68, 0.0] + - - [495, 2048, 1, 256] + - [48, 0.0] + - - [1648, 2048, 1, 256] + - [152, 0.0] + - - [2048, 2048, 1, 4384] + - [22, 0.0] + - - [4096, 2048, 1, 5120] + - [1, 0.0] + - - [2048, 2048, 1, 14912] + - [74, 0.0] + - - [512, 2048, 1, 2068] + - [121, 0.0] + - - [576, 2048, 1, 2048] + - [20, 0.0] + - - [16, 327680, 1, 36] + - [45, 0.0] + - - [576, 2048, 1, 256] + - [102, 0.0] + - - [780, 2048, 1, 256] + - [145, 0.0] + - - [256, 2048, 1, 7652] + - [17, 0.0] + - - [144, 2048, 1, 252] + - [93, 0.0] + - - [128, 2048, 1, 1920] + - [15, 0.0] + - - [256, 2048, 1, 816] + - [63, 0.0] + - - [7808, 2048, 1, 256] + - [128, 0.0] + - - [435, 2048, 1, 128] + - [49, 0.0] + - - [64, 2048, 1, 1723] + - [43, 0.0] + - - [3008, 2048, 1, 512] + - [12, 0.0] + - - [15, 163840, 1, 90] + - [104, 0.0] + - - [128, 2048, 1, 2191] + - [15, 0.0] + - - [32, 196608, 1, 33] + - [45, 0.0] + - - [768, 2048, 1, 512] + - [98, 0.0] + - - [12, 196608, 1, 29] + - [45, 0.0] + - - [7652, 2048, 1, 256] + - [76, 0.0] + - - [14912, 2048, 1, 512] + - [155, 0.0] + - - [96, 2048, 1, 240] + - [57, 0.0] + - - [1536, 2048, 1, 256] + - [8, 0.0] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_UserArgs.yaml index c23fdb7d26a..9545fcd9e92 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_UserArgs.yaml @@ -1300,210 +1300,6 @@ _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 1 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 2 - AssertSummationElementMultiple: 32 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: Custom_Cijk_Alik_Bljk_BBS_BH_MT128x16x128_MI16x16x1_SN_GSUM_K1_MIWT2_1_01 - DebugStreamK: 0 - DepthU: 128 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 4 - GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomWGM: true, SupportUserGSU: true} - KernelLanguage: Assembly - KernelNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_MT128x16x128_MI16x16x1_SN_GSUM_K1_MIWT2_1_01 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumElements: 19712 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 19712 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: 0.4 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: true - MIArchVgpr: 0 - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: -1 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_MT128x16x128_MI16x16x1_SN_GSUM_K1_MIWT2_1_01 - SourceSwap: 0 - StaggerU: 4 - StaggerUMapping: 2 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SynchronizerSizeCheck: 1 - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 1 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 4] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false @@ -1650,7 +1446,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 7 + SolutionIndex: 6 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM2_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU10_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA2_WSGRB1_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -1853,7 +1649,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 8 + SolutionIndex: 7 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM2_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU8_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM2_SUS512_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA2_WSGRB1_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 4 @@ -2056,7 +1852,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 9 + SolutionIndex: 8 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM2_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU15_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM2_SUS256_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA2_WSGRB1_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 4 @@ -2113,14 +1909,14 @@ _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 1 - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 2 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -2130,9 +1926,9 @@ ClusterLocalRead: 1 CodeObjectVersion: default ConvertAfterDS: false - CustomKernelName: Custom_Cijk_Alik_Bljk_BBS_BH_MT16x16x128_MI16x16x1_SN_GSUM_K1_MIWT1_1_3 + CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -2145,13 +1941,13 @@ ExpandPointerSwap: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 7 - GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -2159,47 +1955,48 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomWGM: true, SupportUserGSU: true} + InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, + UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_MT16x16x128_MI16x16x1_SN_GSUM_K1_MIWT1_1_3 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT14_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 1 - LSPB: 4 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 1 - LVPB: 1 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 12800 - LdsNumElementsAlignedA: 2304 - LdsNumElementsAlignedB: 2304 + LdsNumBytes: 64384 + LdsNumElementsAlignedA: 29568 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2304 - LdsOffsetB_Blk: 10496 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 29568 + LdsOffsetB_Blk: 95104 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2304 - LdsOffsetMetadata_Blk: 10496 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LdsOffsetMetadata: 64384 + LdsOffsetMetadata_Blk: 95104 + LdsPadA: 4 + LdsPadB: 4 + LdsPadMetadata: -1 + LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 @@ -2207,220 +2004,15 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [14, 4] + MIWaveTileA: 14 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: -1 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_MT16x16x128_MI16x16x1_SN_GSUM_K1_MIWT1_1_3 - SourceSwap: 0 - StaggerU: 4 - StaggerUMapping: 2 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SynchronizerSizeCheck: 1 - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 1 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 7] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 8 - AssertSummationElementMultiple: 32 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT14_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 64384 - LdsNumElementsAlignedA: 29568 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 29568 - LdsOffsetB_Blk: 95104 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 64384 - LdsOffsetMetadata_Blk: 95104 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: -1 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [14, 4] - MIWaveTileA: 14 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 256 - MacroTileA: 224 - MacroTileB: 256 + MacroTile0: 224 + MacroTile1: 256 + MacroTileA: 224 + MacroTileB: 256 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -2464,7 +2056,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 11 + SolutionIndex: 9 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT14_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -2668,7 +2260,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 12 + SolutionIndex: 10 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -2872,7 +2464,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 13 + SolutionIndex: 11 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -3076,7 +2668,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 14 + SolutionIndex: 12 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -3280,7 +2872,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 15 + SolutionIndex: 13 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -3484,7 +3076,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 16 + SolutionIndex: 14 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -3688,7 +3280,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 17 + SolutionIndex: 15 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 @@ -3892,7 +3484,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 18 + SolutionIndex: 16 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 @@ -4096,7 +3688,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 19 + SolutionIndex: 17 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -4300,7 +3892,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 20 + SolutionIndex: 18 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -4504,7 +4096,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 21 + SolutionIndex: 19 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -4708,7 +4300,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 22 + SolutionIndex: 20 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -4912,7 +4504,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 23 + SolutionIndex: 21 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -5116,7 +4708,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 24 + SolutionIndex: 22 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 0 @@ -5320,7 +4912,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 25 + SolutionIndex: 23 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -5524,7 +5116,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 26 + SolutionIndex: 24 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 0 @@ -5728,7 +5320,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 27 + SolutionIndex: 25 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -5932,7 +5524,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 28 + SolutionIndex: 26 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 @@ -6136,7 +5728,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 29 + SolutionIndex: 27 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -6340,7 +5932,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 30 + SolutionIndex: 28 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 @@ -6544,7 +6136,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 31 + SolutionIndex: 29 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 @@ -6748,7 +6340,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 32 + SolutionIndex: 30 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 @@ -6952,7 +6544,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 33 + SolutionIndex: 31 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 @@ -7156,7 +6748,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 34 + SolutionIndex: 32 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 @@ -7360,7 +6952,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 35 + SolutionIndex: 33 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -7564,7 +7156,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 36 + SolutionIndex: 34 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT208x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT13_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 @@ -7768,7 +7360,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 37 + SolutionIndex: 35 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT288x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT9_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -7972,7 +7564,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 38 + SolutionIndex: 36 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 @@ -8176,7 +7768,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 39 + SolutionIndex: 37 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -8380,7 +7972,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 40 + SolutionIndex: 38 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -8584,7 +8176,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 41 + SolutionIndex: 39 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -8788,7 +8380,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 42 + SolutionIndex: 40 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 @@ -8992,7 +8584,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 43 + SolutionIndex: 41 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 @@ -9196,7 +8788,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 44 + SolutionIndex: 42 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -9400,7 +8992,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 45 + SolutionIndex: 43 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 @@ -9604,7 +9196,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 46 + SolutionIndex: 44 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 @@ -9808,7 +9400,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 47 + SolutionIndex: 45 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -10012,7 +9604,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 48 + SolutionIndex: 46 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 @@ -10216,7 +9808,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 49 + SolutionIndex: 47 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 @@ -10420,7 +10012,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 50 + SolutionIndex: 48 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -10624,7 +10216,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 51 + SolutionIndex: 49 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -10828,7 +10420,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 52 + SolutionIndex: 50 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -11032,7 +10624,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 53 + SolutionIndex: 51 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 @@ -11236,7 +10828,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 54 + SolutionIndex: 52 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -11440,7 +11032,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 55 + SolutionIndex: 53 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 @@ -11644,7 +11236,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 56 + SolutionIndex: 54 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -11848,7 +11440,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 57 + SolutionIndex: 55 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -12052,7 +11644,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 58 + SolutionIndex: 56 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 0 @@ -12256,7 +11848,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 59 + SolutionIndex: 57 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -12460,7 +12052,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 60 + SolutionIndex: 58 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -12664,7 +12256,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 61 + SolutionIndex: 59 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -12868,7 +12460,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 62 + SolutionIndex: 60 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 0 @@ -13072,7 +12664,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 63 + SolutionIndex: 61 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -13276,7 +12868,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 64 + SolutionIndex: 62 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -13480,7 +13072,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 65 + SolutionIndex: 63 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -13684,7 +13276,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 66 + SolutionIndex: 64 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -13888,7 +13480,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 67 + SolutionIndex: 65 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -14092,7 +13684,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 68 + SolutionIndex: 66 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -14296,7 +13888,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 69 + SolutionIndex: 67 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 @@ -14500,7 +14092,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 70 + SolutionIndex: 68 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -14704,7 +14296,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 71 + SolutionIndex: 69 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM1_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 @@ -14908,7 +14500,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 72 + SolutionIndex: 70 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM1_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 0 @@ -15112,7 +14704,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 73 + SolutionIndex: 71 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM1_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU8_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU32_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 32 @@ -15317,7 +14909,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 74 + SolutionIndex: 72 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 @@ -15521,7 +15113,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 75 + SolutionIndex: 73 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 4 @@ -15725,7 +15317,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 76 + SolutionIndex: 74 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -15929,7 +15521,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 77 + SolutionIndex: 75 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -16133,7 +15725,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 78 + SolutionIndex: 76 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 0 @@ -16337,7 +15929,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 79 + SolutionIndex: 77 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -16541,7 +16133,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 80 + SolutionIndex: 78 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -16745,7 +16337,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 81 + SolutionIndex: 79 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 @@ -16949,7 +16541,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 82 + SolutionIndex: 80 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -17153,7 +16745,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 83 + SolutionIndex: 81 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -17357,7 +16949,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 84 + SolutionIndex: 82 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -17561,7 +17153,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 85 + SolutionIndex: 83 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT16_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC4_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -17765,7 +17357,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 86 + SolutionIndex: 84 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC4_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -17969,7 +17561,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 87 + SolutionIndex: 85 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -18173,7 +17765,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 88 + SolutionIndex: 86 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -18379,7 +17971,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 89 + SolutionIndex: 87 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA3_NTB1_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCG0 SourceSwap: true StaggerU: 8 @@ -18586,7 +18178,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 90 + SolutionIndex: 88 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA1_NTB1_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA2_WSGRB2_WS64_WG32_8_1_WGMn16_WGMXCC1_WGMXCCG0 SourceSwap: true StaggerU: 32 @@ -18793,7 +18385,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 91 + SolutionIndex: 89 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT7_4_MO40_NTn1_NTA1_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA2_WSGRB2_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCG0 SourceSwap: false StaggerU: 8 @@ -19000,7 +18592,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 92 + SolutionIndex: 90 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA1_NTB2_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG0 SourceSwap: true StaggerU: 0 @@ -19207,7 +18799,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 93 + SolutionIndex: 91 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA3_NTB1_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA2_WSGRB2_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCG0 SourceSwap: true StaggerU: 8 @@ -19414,7 +19006,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 94 + SolutionIndex: 92 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU3_GSUAMB_GSUC1_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCG4 SourceSwap: 1 StaggerU: 8 @@ -19621,7 +19213,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 95 + SolutionIndex: 93 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -19828,7 +19420,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 96 + SolutionIndex: 94 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC4_WGMXCCG0 SourceSwap: 1 StaggerU: 8 @@ -20035,7 +19627,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 97 + SolutionIndex: 95 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -20242,7 +19834,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 98 + SolutionIndex: 96 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x16x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU5_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -20449,7 +20041,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 99 + SolutionIndex: 97 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -20656,7 +20248,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 100 + SolutionIndex: 98 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -20863,7 +20455,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 101 + SolutionIndex: 99 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -21070,7 +20662,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 102 + SolutionIndex: 100 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCG304 SourceSwap: 1 StaggerU: 16 @@ -21277,7 +20869,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 103 + SolutionIndex: 101 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCG304 SourceSwap: 1 StaggerU: 16 @@ -21484,7 +21076,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 104 + SolutionIndex: 102 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG0 SourceSwap: 1 StaggerU: 16 @@ -21691,7 +21283,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 105 + SolutionIndex: 103 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA1_WSGRB1_WS64_WG16_8_2_WGM1_WGMXCC1_WGMXCCG0 SourceSwap: 1 StaggerU: 16 @@ -21898,7 +21490,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 106 + SolutionIndex: 104 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG0 SourceSwap: 1 StaggerU: 16 @@ -22105,7 +21697,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 107 + SolutionIndex: 105 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG0 SourceSwap: 1 StaggerU: 8 @@ -22312,7 +21904,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 108 + SolutionIndex: 106 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT160x320x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA1_WSGRB1_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCG0 SourceSwap: 1 StaggerU: 8 @@ -22519,7 +22111,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 109 + SolutionIndex: 107 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 SourceSwap: 1 StaggerU: 8 @@ -22727,7 +22319,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 110 + SolutionIndex: 108 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCG304 SourceSwap: 1 StaggerU: 16 @@ -22935,7 +22527,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 111 + SolutionIndex: 109 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCG304 SourceSwap: 1 StaggerU: 16 @@ -23143,7 +22735,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 112 + SolutionIndex: 110 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM4_WGMXCC8_WGMXCCG304 SourceSwap: 1 StaggerU: 16 @@ -23351,7 +22943,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 113 + SolutionIndex: 111 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB8_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCG304 SourceSwap: 1 StaggerU: 16 @@ -23559,7 +23151,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 114 + SolutionIndex: 112 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCG304 SourceSwap: 1 StaggerU: 16 @@ -23767,7 +23359,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 115 + SolutionIndex: 113 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCG304 SourceSwap: 1 StaggerU: 16 @@ -23974,7 +23566,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 116 + SolutionIndex: 114 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -24182,7 +23774,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 117 + SolutionIndex: 115 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 @@ -24391,7 +23983,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 118 + SolutionIndex: 116 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM3_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB8_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM95_WGMXCC4_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -24600,7 +24192,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 119 + SolutionIndex: 117 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM128_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p30_MIAV0_MIWT2_5_MO40_NTn1_NTA1_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 2 @@ -24810,7 +24402,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 120 + SolutionIndex: 118 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM4_AFEM1_ASEM8_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p40_GRVWA8_GRVWB8_GSU2_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p30_MIAV0_MIWT4_3_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS1024_SPO1_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB1_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCG266 SourceSwap: 1 StaggerU: 4 @@ -25020,7 +24612,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 121 + SolutionIndex: 119 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM16_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA1_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO1_SRVW0_SSO1_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 SourceSwap: 1 StaggerU: 8 @@ -25229,7 +24821,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 122 + SolutionIndex: 120 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM2_AFEM4_ASEM8_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM2_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA1_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 4 @@ -25438,7 +25030,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 123 + SolutionIndex: 121 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC1_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB4_NTC3_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS0_SU64_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC8_WGMXCCG0 SourceSwap: 0 StaggerU: 64 @@ -25647,7 +25239,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 124 + SolutionIndex: 122 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM4_AFEM1_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p27_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA2_WSGRB2_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCG152 SourceSwap: 1 StaggerU: 4 @@ -25857,7 +25449,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 125 + SolutionIndex: 123 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB1_WSGRA1_WSGRB2_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCG0 SourceSwap: 1 StaggerU: 8 @@ -26066,7 +25658,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 126 + SolutionIndex: 124 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCG304 SourceSwap: 1 StaggerU: 16 @@ -26277,7 +25869,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 127 + SolutionIndex: 125 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCG304 SourceSwap: 1 StaggerU: 8 @@ -26487,7 +26079,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 128 + SolutionIndex: 126 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCG304 SourceSwap: 1 StaggerU: 0 @@ -26701,7 +26293,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 129 + SolutionIndex: 127 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p25_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM133_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 @@ -26920,7 +26512,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 130 + SolutionIndex: 128 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x80x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p30_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA2_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCG0 SourceSwap: 1 StaggerU: 4 @@ -27139,7 +26731,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 131 + SolutionIndex: 129 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p27_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC7_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM4_SUS128_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCG0 SourceSwap: 1 StaggerU: 4 @@ -27358,7 +26950,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 132 + SolutionIndex: 130 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC1_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -27577,7 +27169,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 133 + SolutionIndex: 131 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCG0 SourceSwap: 1 StaggerU: 32 @@ -27796,7 +27388,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 134 + SolutionIndex: 132 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p50_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -28015,7 +27607,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 135 + SolutionIndex: 133 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -28234,7 +27826,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 136 + SolutionIndex: 134 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA1_WSGRB0_WS64_WG32_8_1_WGM12_WGMXCC8_WGMXCCG0 SourceSwap: 1 StaggerU: 8 @@ -28453,7 +28045,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 137 + SolutionIndex: 135 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x80x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC4_WGMXCCG0 SourceSwap: 1 StaggerU: 4 @@ -28672,7 +28264,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 138 + SolutionIndex: 136 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p40_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p90_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM3_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCG0 SourceSwap: 1 StaggerU: 2 @@ -28892,7 +28484,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 139 + SolutionIndex: 137 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p27_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS256_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM0_WGMXCC32_WGMXCCG0 SourceSwap: 1 StaggerU: 4 @@ -29111,7 +28703,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 140 + SolutionIndex: 138 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -29330,7 +28922,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 141 + SolutionIndex: 139 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p90_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC1_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 4 @@ -29549,7 +29141,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 142 + SolutionIndex: 140 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB1_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 @@ -29768,7 +29360,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 143 + SolutionIndex: 141 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p90_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB4_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 2 @@ -29987,7 +29579,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 144 + SolutionIndex: 142 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA2_WSGRB2_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 SourceSwap: 1 StaggerU: 2 @@ -30206,7 +29798,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 145 + SolutionIndex: 143 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM1_SUS128_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 2 @@ -30425,7 +30017,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 146 + SolutionIndex: 144 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM0p27_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW8_LWPM0p80_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS0_NLCA2_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_1_WGM266_WGMXCC1_WGMXCCG0 SourceSwap: 1 StaggerU: 2 @@ -30644,7 +30236,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 147 + SolutionIndex: 145 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 @@ -30863,7 +30455,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 148 + SolutionIndex: 146 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p70_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT3_10_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM3_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA2_WSGRB2_WS64_WG64_4_1_WGM0_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 @@ -31082,7 +30674,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 149 + SolutionIndex: 147 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCG0 SourceSwap: 1 StaggerU: 0 @@ -31302,7 +30894,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 150 + SolutionIndex: 148 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x80x16_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p60_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS64_SPO0_SRVW0_SSO1_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 4 @@ -31521,7 +31113,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 151 + SolutionIndex: 149 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU64_SUM2_SUS2048_SPO1_SRVW0_SSO1_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCG0 SourceSwap: 1 StaggerU: 64 @@ -31740,7 +31332,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 152 + SolutionIndex: 150 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB1_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA16_LPB0_LPM0_LRVW8_LWPM0p80_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS6_NLCA1_NLCB2_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM2_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC2_WGMXCCGn1 SourceSwap: 1 StaggerU: 2 @@ -31959,7 +31551,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 153 + SolutionIndex: 151 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC1_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO2_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA0_WSGRB1_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -32178,7 +31770,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 154 + SolutionIndex: 152 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p26_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p25_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM4_SUS128_SPO1_SRVW0_SSO1_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC4_WGMXCCG0 SourceSwap: 1 StaggerU: 4 @@ -32398,7 +31990,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 155 + SolutionIndex: 153 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x80x16_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p90_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC1_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM95_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 @@ -32617,7 +32209,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 156 + SolutionIndex: 154 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p50_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCG0 SourceSwap: 1 StaggerU: 4 @@ -32836,7 +32428,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 157 + SolutionIndex: 155 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p25_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM1_SUS1024_SPO1_SRVW0_SSO2_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 2 @@ -33055,7 +32647,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 158 + SolutionIndex: 156 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM0p27_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCG32 SourceSwap: 1 StaggerU: 4 @@ -33275,7 +32867,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 159 + SolutionIndex: 157 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB16_LPM0_LRVW8_LWPM0p20_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC7_NTD0_NTM0_NEPBS8_NLCA8_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM1_SUS512_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB1_WS64_WG16_4_1_WGM1_WGMXCC16_WGMXCCG0 SourceSwap: 1 StaggerU: 4 @@ -33494,7 +33086,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 160 + SolutionIndex: 158 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p70_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPM0p20_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB7_NTC0_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM4_SUS128_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB1_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM76_WGMXCC32_WGMXCCG0 SourceSwap: 1 StaggerU: 16 @@ -33713,7 +33305,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 161 + SolutionIndex: 159 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x80x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p80_GRVWA4_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM0_SUS64_SPO1_SRVW0_SSO3_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA1_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 2 @@ -33932,7 +33524,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 162 + SolutionIndex: 160 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p90_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPM0p40_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC7_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM1_SUS256_SPO0_SRVW0_SSO1_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 @@ -34151,7 +33743,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 163 + SolutionIndex: 161 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x48x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p70_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPM1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU4_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA2_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC8_WGMXCCG0 SourceSwap: 0 StaggerU: 4 @@ -34370,7 +33962,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 164 + SolutionIndex: 162 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT320x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p20_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPM0p25_MIAV0_MIWT10_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 2 @@ -34589,7 +34181,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 165 + SolutionIndex: 163 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPM1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM0_SUS128_SPO1_SRVW0_SSO3_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 2 @@ -34808,7 +34400,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 166 + SolutionIndex: 164 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x320x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB1_EPS0_FDSI0_GRPM0p80_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA16_LPB0_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB2_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM4_SUS256_SPO1_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA2_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -35027,7 +34619,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 167 + SolutionIndex: 165 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p60_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA2_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCG0 SourceSwap: 1 StaggerU: 2 @@ -35247,7 +34839,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 168 + SolutionIndex: 166 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPM1_MIAV0_MIWT1_5_MO40_NTn1_NTA4_NTB0_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL2_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM2_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA1_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 4 @@ -35462,7 +35054,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 169 + SolutionIndex: 167 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA2_WSGRB2_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCG0 SourceSwap: 1 StaggerU: 8 @@ -35669,7 +35261,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 170 + SolutionIndex: 168 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p50_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCG304 SourceSwap: 1 StaggerU: 0 @@ -35877,7 +35469,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 171 + SolutionIndex: 169 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM16_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCG304 SourceSwap: 1 StaggerU: 0 @@ -36084,7 +35676,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 172 + SolutionIndex: 170 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA2_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1_WGMXCCG0 SourceSwap: 1 StaggerU: 8 @@ -36300,7 +35892,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 173 + SolutionIndex: 171 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 @@ -36515,7 +36107,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 174 + SolutionIndex: 172 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 @@ -36731,7 +36323,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 175 + SolutionIndex: 173 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 @@ -36956,7 +36548,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 176 + SolutionIndex: 174 SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x48x256_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 StaggerU: 16 @@ -37078,415 +36670,411 @@ - - [1024, 65536, 1, 8192] - [0, 0.0] - - [8192, 1, 1, 1024] - - [72, 0.0] + - [70, 0.0] - - [8192, 2, 1, 1024] - - [72, 0.0] + - [70, 0.0] - - [8192, 4, 1, 1024] - - [72, 0.0] + - [70, 0.0] - - [8192, 8, 1, 1024] - - [58, 0.0] + - [56, 0.0] - - [8192, 16, 1, 1024] - - [44, 0.0] + - [42, 0.0] - - [8192, 32, 1, 1024] - - [44, 0.0] + - [42, 0.0] - - [8192, 64, 1, 1024] - - [52, 0.0] + - [50, 0.0] - - [8192, 64, 1, 3584] - - [80, 0.0] + - [78, 0.0] - - [8192, 128, 1, 1024] - - [21, 0.0] + - [19, 0.0] - - [8192, 128, 1, 3584] - - [79, 0.0] + - [77, 0.0] - - [512, 512, 1, 512] - - [65, 0.0] + - [63, 0.0] - - [1024, 1024, 1, 1024] - - [21, 0.0] + - [19, 0.0] - - [2048, 2048, 1, 2048] - - [55, 0.0] + - [53, 0.0] - - [4096, 4096, 1, 4096] - - [35, 0.0] + - [33, 0.0] - - [4096, 4096, 1, 8192] - - [86, 0.0] + - [84, 0.0] - - [8192, 8192, 1, 8192] - - [88, 0.0] + - [86, 0.0] - - [2, 7168, 1, 8192] - - [7, 0.0] + - [6, 0.0] - - [2, 3584, 1, 8192] - - [8, 0.0] + - [7, 0.0] - - [2, 1280, 1, 8192] - - [9, 0.0] + - [8, 0.0] - - [8192, 7168, 1, 8192] - - [87, 0.0] - - - [8192, 3584, 1, 8192] - [85, 0.0] + - - [8192, 3584, 1, 8192] + - [83, 0.0] - - [8192, 1280, 1, 8192] - - [84, 0.0] - - - [3584, 2, 1, 8192] - - [6, 0.0] - - - [1024, 2, 1, 8192] - - [10, 0.0] + - [82, 0.0] - - [152710, 2048, 1, 512] - - [11, 0.0] + - [9, 0.0] - - [6, 384, 1, 1536] - - [12, 0.0] + - [10, 0.0] - - [6, 3072, 1, 768] - - [13, 0.0] + - [11, 0.0] - - [462, 768, 1, 384] - - [14, 0.0] + - [12, 0.0] - - [462, 4096, 1, 384] - - [15, 0.0] + - [13, 0.0] - - [462, 1472, 1, 384] - - [16, 0.0] + - [14, 0.0] - - [6, 3072, 1, 1536] - - [12, 0.0] + - [10, 0.0] - - [462, 768, 1, 768] - - [17, 0.0] - - - [462, 4096, 1, 768] - [15, 0.0] + - - [462, 4096, 1, 768] + - [13, 0.0] - - [462, 1472, 1, 768] - - [16, 0.0] + - [14, 0.0] - - [6, 3072, 1, 3072] - - [18, 0.0] + - [16, 0.0] - - [462, 768, 1, 1536] - - [19, 0.0] + - [17, 0.0] - - [462, 4096, 1, 1536] - - [20, 0.0] + - [18, 0.0] - - [462, 1472, 1, 1536] - - [77, 0.0] + - [75, 0.0] - - [16384, 128, 1, 2304] - - [76, 0.0] + - [74, 0.0] - - [1152, 1152, 1, 1152] - - [22, 0.0] + - [20, 0.0] - - [2560, 2560, 1, 2560] - - [23, 0.0] + - [21, 0.0] - - [384, 1536, 1, 12288] - - [82, 0.0] + - [80, 0.0] - - [384, 6144, 1, 1536] - - [24, 0.0] + - [22, 0.0] - - [96, 1536, 1, 1536] - - [25, 0.0] + - [23, 0.0] - - [96, 1536, 1, 12288] - - [78, 0.0] + - [76, 0.0] - - [96, 6144, 1, 1536] - - [26, 0.0] + - [24, 0.0] - - [6144, 384, 1, 3072] - - [27, 0.0] + - [25, 0.0] - - [6144, 1536, 1, 384] - - [28, 0.0] + - [26, 0.0] - - [2048, 131072, 1, 16384] - - [29, 0.0] + - [27, 0.0] - - [1408, 1408, 1, 1408] - - [30, 0.0] + - [28, 0.0] - - [2688, 2688, 1, 2688] - - [31, 0.0] + - [29, 0.0] - - [3712, 3712, 1, 3712] - - [32, 0.0] + - [30, 0.0] - - [3840, 3840, 1, 3840] - - [33, 0.0] + - [31, 0.0] - - [3968, 3968, 1, 3968] - - [34, 0.0] + - [32, 0.0] - - [3328, 3328, 1, 3328] - - [36, 0.0] + - [34, 0.0] - - [4000, 8192, 1, 8192] - - [37, 0.0] + - [35, 0.0] - - [64896, 2048, 1, 512] - - [38, 0.0] + - [36, 0.0] - - [6144, 384, 1, 384] - - [39, 0.0] + - [37, 0.0] - - [16384, 32768, 1, 2304] - - [40, 0.0] + - [38, 0.0] - - [16384, 128, 1, 13312] - - [81, 0.0] + - [79, 0.0] - - [1536, 1536, 1, 1536] - - [41, 0.0] + - [39, 0.0] - - [1664, 1664, 1, 1664] - - [42, 0.0] + - [40, 0.0] - - [2816, 2816, 1, 2816] - - [43, 0.0] + - [41, 0.0] - - [1536, 768, 1, 768] - - [45, 0.0] + - [43, 0.0] - - [1536, 768, 1, 6144] - - [46, 0.0] + - [44, 0.0] - - [1536, 3072, 1, 768] - - [47, 0.0] + - [45, 0.0] - - [384, 1536, 1, 1536] - - [83, 0.0] + - [81, 0.0] - - [2048, 65536, 1, 16384] - - [29, 0.0] + - [27, 0.0] - - [2048, 8192, 1, 16384] - - [48, 0.0] + - [46, 0.0] - - [1792, 1792, 1, 1792] - - [49, 0.0] + - [47, 0.0] - - [1920, 1920, 1, 1920] - - [50, 0.0] + - [48, 0.0] - - [2944, 2944, 1, 2944] - - [51, 0.0] + - [49, 0.0] - - [16384, 8, 1, 13312] - - [53, 0.0] + - [51, 0.0] - - [16384, 4096, 1, 13312] - - [54, 0.0] + - [52, 0.0] - - [2176, 2176, 1, 2176] - - [56, 0.0] + - [54, 0.0] - - [3072, 3072, 1, 3072] - - [57, 0.0] + - [55, 0.0] - - [2048, 32768, 1, 16384] - - [59, 0.0] + - [57, 0.0] - - [2304, 2304, 1, 2304] - - [60, 0.0] + - [58, 0.0] - - [2432, 2432, 1, 2432] - - [61, 0.0] + - [59, 0.0] - - [3200, 3200, 1, 3200] - - [62, 0.0] + - [60, 0.0] - - [256, 256, 1, 256] - - [63, 0.0] + - [61, 0.0] - - [384, 384, 1, 384] - - [64, 0.0] + - [62, 0.0] - - [640, 640, 1, 640] - - [66, 0.0] + - [64, 0.0] - - [768, 768, 1, 768] - - [67, 0.0] + - [65, 0.0] - - [896, 896, 1, 896] - - [68, 0.0] + - [66, 0.0] - - [3456, 3456, 1, 3456] - - [69, 0.0] + - [67, 0.0] - - [3584, 3584, 1, 3584] - - [70, 0.0] + - [68, 0.0] - - [16384, 2, 1, 2304] - - [71, 0.0] + - [69, 0.0] - - [4000, 2, 1, 8192] - - [73, 0.0] + - [71, 0.0] - - [5560, 1024, 1, 2780] - - [74, 0.0] + - [72, 0.0] - - [2780, 1024, 1, 5560] - - [75, 0.0] + - [73, 0.0] - - [12288, 2048, 1, 4096] - - [89, 0.0] + - [87, 0.0] - - [22016, 2048, 1, 4096] - - [90, 0.0] + - [88, 0.0] - - [32000, 2048, 1, 4096] - - [90, 0.0] + - [88, 0.0] - - [4096, 2048, 1, 4096] - - [91, 0.0] + - [89, 0.0] - - [4096, 65536, 1, 11008] - - [92, 0.0] + - [90, 0.0] - - [4096, 65536, 1, 32000] - - [93, 0.0] + - [91, 0.0] - - [4096, 49152, 1, 32000] - - [117, 0.0] + - [115, 0.0] - - [1024, 2048, 1, 16256] - - [94, 0.0] + - [92, 0.0] - - [512, 331051, 1, 1536] - - [95, 0.0] + - [93, 0.0] - - [512, 395225, 1, 1024] - - [96, 0.0] + - [94, 0.0] - - [7680, 16, 1, 6144] - - [97, 0.0] + - [95, 0.0] - - [6144, 16, 1, 19648] - - [98, 0.0] + - [96, 0.0] - - [39296, 16, 1, 6144] - - [99, 0.0] + - [97, 0.0] - - [39296, 8, 1, 6144] - - [100, 0.0] - - - [6144, 8, 1, 19648] - [98, 0.0] + - - [6144, 8, 1, 19648] + - [96, 0.0] - - [7680, 8, 1, 6144] - - [101, 0.0] + - [99, 0.0] - - [1024, 2885, 1, 1024] - - [102, 0.0] + - [100, 0.0] - - [1024, 2885, 1, 4096] - - [102, 0.0] + - [100, 0.0] - - [3072, 1875, 1, 3072] - - [103, 0.0] + - [101, 0.0] - - [3072, 1875, 1, 8192] - - [104, 0.0] + - [102, 0.0] - - [3072, 757, 1, 3072] - - [105, 0.0] + - [103, 0.0] - - [3072, 757, 1, 4096] - - [105, 0.0] + - [103, 0.0] - - [4096, 2885, 1, 1024] - - [106, 0.0] + - [104, 0.0] - - [16384, 1875, 1, 3072] - - [107, 0.0] + - [105, 0.0] - - [32064, 1875, 1, 3072] - - [108, 0.0] + - [106, 0.0] - - [9216, 1875, 1, 3072] - - [109, 0.0] + - [107, 0.0] - - [2048, 1152, 1, 2048] - - [110, 0.0] + - [108, 0.0] - - [1024, 4608, 1, 1024] - - [111, 0.0] + - [109, 0.0] - - [2048, 462, 1, 4096] - - [112, 0.0] + - [110, 0.0] - - [512, 18432, 1, 512] - - [113, 0.0] + - [111, 0.0] - - [512, 18432, 1, 2048] - - [114, 0.0] + - [112, 0.0] - - [2048, 308, 1, 1472] - - [115, 0.0] + - [113, 0.0] - - [50304, 51200, 1, 1600] - - [116, 0.0] + - [114, 0.0] - - [8192, 57344, 1, 32000] - - [117, 0.0] + - [115, 0.0] - - [8192, 106496, 1, 32000] - - [117, 0.0] + - [115, 0.0] - - [8192, 2048, 1, 1024] - - [118, 0.0] + - [116, 0.0] - - [1280, 2048, 1, 8192] - - [119, 0.0] + - [117, 0.0] - - [1280, 1024, 1, 8192] - - [120, 0.0] + - [118, 0.0] - - [8192, 4096, 1, 1024] - - [121, 0.0] + - [119, 0.0] - - [1280, 8192, 1, 8192] - - [122, 0.0] + - [120, 0.0] - - [1280, 4096, 1, 8192] - - [123, 0.0] + - [121, 0.0] - - [8192, 1024, 1, 1024] - - [124, 0.0] + - [122, 0.0] - - [8192, 8192, 1, 1024] - - [125, 0.0] + - [123, 0.0] - - [1024, 1024, 48, 128] - - [126, 105.88] + - [124, 105.88] - - [1357, 1024, 48, 128] - - [127, 93.98] + - [125, 93.98] - - [256, 256, 64, 64] - - [128, 40.77] + - [126, 40.77] - - [256, 796032, 1, 1024] - - [129, 0.0] + - [127, 0.0] - - [222, 5905, 1, 1024] - - [130, 0.0] + - [128, 0.0] - - [256, 5905, 1, 128] - - [131, 0.0] + - [129, 0.0] - - [1024, 5905, 1, 6144] - - [132, 0.0] + - [130, 0.0] - - [4096, 5905, 1, 6144] - - [133, 0.0] + - [131, 0.0] - - [1024, 5905, 1, 11715] - - [134, 0.0] + - [132, 0.0] - - [2048, 5905, 1, 5120] - - [135, 0.0] + - [133, 0.0] - - [11715, 5905, 1, 1024] - - [136, 0.0] + - [134, 0.0] - - [512, 5905, 1, 512] - - [137, 0.0] + - [135, 0.0] - - [256, 5905, 1, 1024] - - [138, 0.0] + - [136, 0.0] - - [256, 1024, 1, 256] - - [139, 0.0] + - [137, 0.0] - - [2048, 5905, 1, 1475] - - [140, 0.0] + - [138, 0.0] - - [2048, 5905, 1, 4440] - - [141, 0.0] + - [139, 0.0] - - [512, 5905, 1, 1475] - - [142, 0.0] + - [140, 0.0] - - [256, 795008, 1, 256] - - [143, 0.0] + - [141, 0.0] - - [1024, 5905, 1, 5120] - - [144, 0.0] + - [142, 0.0] - - [768, 796032, 1, 256] - - [145, 0.0] + - [143, 0.0] - - [24, 1511680, 1, 224] - - [146, 0.0] + - [144, 0.0] - - [5120, 5905, 1, 4096] - - [147, 0.0] + - [145, 0.0] - - [1475, 5905, 1, 1024] - - [148, 0.0] + - [146, 0.0] - - [512, 5905, 1, 256] - - [149, 0.0] + - [147, 0.0] - - [256, 222, 5905, 20] - - [150, 0.0] + - [148, 0.0] - - [1024, 5905, 1, 1475] - - [151, 0.0] + - [149, 0.0] - - [11144, 5905, 1, 128] - - [152, 0.0] + - [150, 0.0] - - [2048, 5905, 1, 4040] - - [153, 0.0] + - [151, 0.0] - - [256, 5905, 1, 64] - - [154, 0.0] + - [152, 0.0] - - [256, 202, 5905, 20] - - [155, 0.0] + - [153, 0.0] - - [202, 5905, 1, 1024] - - [156, 0.0] + - [154, 0.0] - - [128, 5905, 1, 1475] - - [157, 0.0] + - [155, 0.0] - - [256, 5905, 1, 512] - - [158, 0.0] + - [156, 0.0] - - [8, 5905, 1, 512] - - [159, 0.0] + - [157, 0.0] - - [256, 795008, 1, 512] - - [160, 0.0] + - [158, 0.0] - - [1475, 5905, 1, 128] - - [161, 0.0] + - [159, 0.0] - - [256, 795008, 1, 192] - - [162, 0.0] + - [160, 0.0] - - [128, 5905, 1, 1024] - - [163, 0.0] + - [161, 0.0] - - [5120, 5905, 1, 11715] - - [164, 0.0] + - [162, 0.0] - - [20, 1511680, 1, 198] - - [165, 0.0] + - [163, 0.0] - - [6144, 5905, 1, 1024] - - [166, 0.0] + - [164, 0.0] - - [1024, 5905, 1, 512] - - [167, 0.0] + - [165, 0.0] - - [20, 1511680, 1, 202] - - [168, 0.0] + - [166, 0.0] - - [8192, 2048, 1, 3584] - - [90, 0.0] + - [88, 0.0] - - [4096, 49152, 1, 128256] - - [124, 0.0] + - [122, 0.0] - - [4096, 49152, 1, 1024] - - [145, 0.0] + - [143, 0.0] - - [512, 24576, 1, 2048] - - [129, 0.0] + - [127, 0.0] - - [2048, 4112, 1, 3840] - - [169, 0.0] + - [167, 0.0] - - [384, 384, 16, 192] - - [154, 0.0] + - [152, 0.0] - - [2304, 528, 1, 576] - - [154, 0.0] + - [152, 0.0] - - [96, 21, 32, 96] - - [71, 0.0] + - [69, 0.0] - - [512, 24576, 1, 304] - - [170, 0.0] + - [168, 0.0] - - [576, 32, 1, 576] - - [171, 0.0] + - [169, 0.0] - - [576, 24576, 1, 576] - - [172, 0.0] + - [170, 0.0] - - [576, 4096, 1, 576] - - [173, 0.0] + - [171, 0.0] - - [576, 528, 1, 576] - - [64, 0.0] + - [62, 0.0] - - [1, 64, 1, 1] - - [99, 0.0] + - [97, 0.0] - - [8192, 4112, 1, 2048] - - [107, 0.0] + - [105, 0.0] - - [1728, 4096, 1, 576] - - [132, 0.0] + - [130, 0.0] - - [1728, 528, 1, 576] - - [115, 0.0] + - [113, 0.0] - - [21, 96, 32, 96] - - [13, 0.0] + - [11, 0.0] - - [304, 24576, 1, 512] - [4, 0.0] - - [3840, 24576, 1, 576] - - [174, 0.0] + - [172, 0.0] - - [1024, 16, 1, 1024] - - [12, 0.0] + - [10, 0.0] - - [1024, 16, 1, 512] - - [63, 0.0] + - [61, 0.0] - - [2048, 4112, 1, 2048] - - [114, 0.0] + - [112, 0.0] - - [576, 32, 1, 2304] - - [175, 0.0] + - [173, 0.0] - - [2048, 24576, 1, 512] - - [96, 0.0] + - [94, 0.0] - - [96, 96, 32, 96] - - [139, 0.0] + - [137, 0.0] - - [1152, 4096, 1, 576] - - [111, 0.0] + - [109, 0.0] - - [576, 528, 1, 2304] - - [176, 0.0] + - [174, 0.0] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942_80cu/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942_80cu/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_UserArgs.yaml index bb1d4312f1f..c703bdb7b72 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942_80cu/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942_80cu/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH_UserArgs.yaml @@ -17,6 +17,7 @@ ComputeDataType: 0 DataType: 7 DataTypeA: 7 + DataTypeAmaxD: 0 DataTypeB: 7 DataTypeE: 7 DestDataType: 7 @@ -48,13 +49,17 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + OutputAmaxD: false SetConstStrideA: [] SetConstStrideB: [] SetConstStrideBias: [] SilentHighPrecisionAccumulate: false Sparse: 0 + StochasticRounding: false StridedBatched: true SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false TLUA: false TLUB: false Tensor0: 0 @@ -77,6 +82,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -87,21 +94,28 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -113,7 +127,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x64_MI16x16x1_SN_GRVWA8_GRVWB8_LBSPPA1024_LBSPPB128_LPA16_LPB16_LRVW8_MIWT8_4_SVW8_VWA8_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -175,7 +190,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -186,6 +203,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 128 NumGlobalWriteVectorsPerThread: 16 @@ -208,7 +226,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_LBSPPA1024_LBSPPB128_LPA16_LPB16_LRVW8_MIWT8_4_SU0_SUM0_SUS0_SVW8_VWA8_WG32_8_1_WGM40 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM40_WGMXCC1_WGMXCCGn1 SourceSwap: true StaggerU: 0 StaggerUMapping: 0 @@ -217,6 +235,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -229,6 +250,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -242,10 +264,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 40 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -262,6 +286,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -272,21 +298,28 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -298,7 +331,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x64_MI16x16x1_SN_GRVWA8_GRVWB8_LBSPPA1024_LBSPPB128_LPA16_LPB16_LRVW8_MIWT8_4_SVW8_VWA8_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -360,7 +394,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -371,6 +407,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 128 NumGlobalWriteVectorsPerThread: 16 @@ -393,7 +430,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_LBSPPA1024_LBSPPB128_LPA16_LPB16_LRVW8_MIWT8_4_SU8_SUM0_SUS256_SVW8_VWA8_WG32_8_1_WGM40 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM40_WGMXCC1_WGMXCCGn1 SourceSwap: true StaggerU: 8 StaggerUMapping: 0 @@ -402,6 +439,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -414,6 +454,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -427,10 +468,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 40 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -447,6 +490,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -457,21 +502,28 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -483,7 +535,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_GRVWA4_GRVWB4_LBSPPA256_LBSPPB128_LPA4_LPB4_LRVW4_MIWT14_4_SVW2_VWA2_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT14_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -545,7 +598,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -556,6 +611,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 224 NumGlobalWriteVectorsPerThread: 112 @@ -578,7 +634,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_GRVWA4_GRVWB4_GSU1_LBSPPA256_LBSPPB128_LPA4_LPB4_LRVW4_MIWT14_4_SU8_SUM0_SUS256_SVW2_VWA2_WG16_16_1_WGM0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT14_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: true StaggerU: 8 StaggerUMapping: 0 @@ -587,6 +643,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -599,6 +658,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -612,10 +672,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -632,6 +694,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -642,21 +706,28 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -668,7 +739,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x224x64_MI16x16x1_SN_GRVWA4_GRVWB4_LBSPPA128_LBSPPB128_LPA4_LPB4_LRVW4_MIWT7_7_SVW1_VWA1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT7_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -730,7 +802,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -741,6 +815,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 196 NumGlobalWriteVectorsPerThread: 196 @@ -763,7 +838,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x224x64_MI16x16x1_SN_GRVWA4_GRVWB4_GSU1_LBSPPA128_LBSPPB128_LPA4_LPB4_LRVW4_MIWT7_7_SU8_SUM0_SUS256_SVW1_VWA1_WG32_8_1_WGM0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT7_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: true StaggerU: 8 StaggerUMapping: 0 @@ -772,6 +847,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -784,6 +862,7 @@ ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -797,10 +876,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -817,6 +898,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -827,21 +910,28 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -852,7 +942,8 @@ InterleaveAlpha: 0 InternalSupportParams: {SupportCustomWGM: true, SupportUserGSU: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x192x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_LBSPPA1024_LPA4_LPB4_LRVW4_MIWT8_6_SU4_SUS512_SVW8_VWA8_WSGRB0_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -914,7 +1005,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -925,6 +1018,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 192 NumGlobalWriteVectorsPerThread: 24 @@ -947,7 +1041,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x192x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_GSU1_LBSPPA1024_LPA4_LPB4_LRVW4_MIWT8_6_SU4_SUS512_SVW8_VWA8_WSGRB0_WG32_8_1_WGM1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM2_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 4 StaggerUMapping: 2 @@ -956,6 +1050,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -968,6 +1065,7 @@ ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -981,9 +1079,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -1000,6 +1101,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -1010,21 +1113,28 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -1035,7 +1145,8 @@ InterleaveAlpha: 0 InternalSupportParams: {SupportCustomWGM: true, SupportUserGSU: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x192x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_LBSPPA1024_LPA4_LPB4_LRVW4_MIWT8_6_SU8_SUS256_SVW8_VWA8_WSGRB0_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -1097,7 +1208,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -1108,6 +1221,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 192 NumGlobalWriteVectorsPerThread: 24 @@ -1130,7 +1244,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x192x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_GSU1_LBSPPA1024_LPA4_LPB4_LRVW4_MIWT8_6_SU8_SUS256_SVW8_VWA8_WSGRB0_WG32_8_1_WGM40 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM2_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM40_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 2 @@ -1139,6 +1253,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -1151,6 +1268,7 @@ ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -1164,9 +1282,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 40 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -1183,6 +1304,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -1193,21 +1316,28 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -1218,7 +1348,8 @@ InterleaveAlpha: 0 InternalSupportParams: {SupportCustomWGM: true, SupportUserGSU: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB4_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_9_SU8_SUS512_SVW4_VWA4_WSGRB0_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x144x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -1280,7 +1411,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -1291,6 +1424,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 144 NumGlobalWriteVectorsPerThread: 36 @@ -1313,7 +1447,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB4_GSU1_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_9_SU8_SUS512_SVW4_VWA4_WSGRB0_WG64_4_1_WGM40 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x144x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM2_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM40_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 2 @@ -1322,6 +1456,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -1334,6 +1471,7 @@ ThreadTileB: 9 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -1347,9 +1485,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 40 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -1366,6 +1507,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -1376,21 +1519,28 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -1401,7 +1551,8 @@ InterleaveAlpha: 0 InternalSupportParams: {SupportCustomWGM: true, SupportUserGSU: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x256x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_8_SU8_SUS256_SVW4_VWA4_WSGRB1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB1_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -1463,7 +1614,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -1474,6 +1627,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 128 NumGlobalWriteVectorsPerThread: 32 @@ -1496,7 +1650,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x256x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_8_SU8_SUS256_SVW4_VWA4_WSGRB1_WG32_8_1_WGM40 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM2_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB1_WS64_WG32_8_1_WGM40_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 2 @@ -1505,6 +1659,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -1517,6 +1674,7 @@ ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -1530,9 +1688,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 40 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -1549,6 +1710,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -1559,21 +1722,28 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -1585,7 +1755,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_GRVWA4_GRVWB4_LBSPPA512_LBSPPB128_LPA4_LPB4_LRVW4_MIWT4_14_SVW4_VWA4_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -1647,7 +1818,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -1658,6 +1831,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 224 NumGlobalWriteVectorsPerThread: 56 @@ -1680,7 +1854,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 8 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_GRVWA4_GRVWB4_GSU1_LBSPPA512_LBSPPB128_LPA4_LPB4_LRVW4_MIWT4_14_SU8_SUM0_SUS256_SVW4_VWA4_WG64_4_1_WGM40 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM40_WGMXCC1_WGMXCCGn1 SourceSwap: true StaggerU: 8 StaggerUMapping: 0 @@ -1689,6 +1863,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -1701,6 +1878,7 @@ ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -1714,10 +1892,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 40 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -1734,8 +1914,10 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 2 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -1744,22 +1926,29 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default - CustomKernelName: Custom_Cijk_Alik_Bljk_BBS_BH_MT128x16x128_MI16x16x1_SN_GSUM_K1_MIWT2_1_01 - DepthU: 128 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 4 - GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalWriteVectorWidth: 2 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1767,44 +1956,44 @@ ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomWGM: true, SupportUserGSU: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x16x128_MI16x16x1_SN_GSUM_MIWT2_1 - LSCA: 128 - LSCB: 128 - LSPA: 4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA1_WSGRB2_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 8 LSPB: 4 - LVCA: 16 + LVCA: 8 LVCB: 16 LVPA: 1 LVPB: 1 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 19712 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 2304 + LdsNumElements: 32128 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 15232 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 19712 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 32128 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 8 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: true MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] @@ -1815,14 +2004,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -1831,7 +2020,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -1842,15 +2033,16 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 NumLoadsA: 8 - NumLoadsB: 1 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 14 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -1864,198 +2056,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x16x128_MI16x16x1_SN_GSU4_MIWT2_1_WGM1 - SourceSwap: 0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_14_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM2_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA1_WSGRB2_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 StaggerU: 4 StaggerUMapping: 2 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - SynchronizerSizeCheck: 1 - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 1 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 4] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 8 - AssertSummationElementMultiple: 32 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA512_LBSPPB128_LPA8_LPB4_LRVW4_MIWT4_14_SVW4_VWA4_WG64_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 8 - LVCB: 16 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumElements: 32128 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 15232 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32128 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 8 - LdsPadB: 4 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: 0 - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 14] - MIWaveTileA: 4 - MIWaveTileB: 14 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 8 - NumLoadsB: 14 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 14 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA512_LBSPPB128_LPA8_LPB4_LRVW4_MIWT4_14_SVW4_VWA4_WG64_4_1 - SourceSwap: 1 - StaggerU: 4 - StaggerUMapping: 2 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -2068,6 +2080,7 @@ ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -2081,9 +2094,12 @@ WaveSeparateGlobalReadA: 1 WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [0, 0] _DepthU: 64 @@ -2100,6 +2116,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 2 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -2110,21 +2128,28 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 10 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -2135,7 +2160,8 @@ InterleaveAlpha: 0 InternalSupportParams: {SupportUserGSU: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT16x128x64_MI16x16x1_SN_LDSB1_LBSPPA128_LBSPPB128_MIWT1_2_NTB0_NLCA1_SU0_SUM0_SUS0_SPO1_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM2_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA2_WSGRB1_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 4 @@ -2197,7 +2223,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -2208,6 +2236,7 @@ NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 @@ -2229,8 +2258,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT16x128x64_MI16x16x1_SN_LDSB1_GSU10_LBSPPA128_LBSPPB128_MIWT1_2_NTB0_NLCA1_SU0_SUM0_SUS0_SPO1_WG16_16_1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM2_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU10_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA2_WSGRB1_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -2239,6 +2268,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -2251,6 +2283,7 @@ ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -2264,9 +2297,12 @@ WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 10] _DepthU: 64 @@ -2283,6 +2319,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 2 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -2293,21 +2331,28 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 8 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -2318,7 +2363,8 @@ InterleaveAlpha: 0 InternalSupportParams: {SupportUserGSU: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT16x32x64_MI16x16x1_SN_LDSB0_LBSPPA128_LBSPPB128_MIWT1_1_NTB0_NLCA1_SU4_SUM2_SUS512_SPO1_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM2_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA2_WSGRB1_WS64_WG16_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 2 @@ -2380,7 +2426,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -2391,6 +2439,7 @@ NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 @@ -2412,8 +2461,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT16x32x64_MI16x16x1_SN_LDSB0_GSU8_LBSPPA128_LBSPPB128_MIWT1_1_NTB0_NLCA1_SU4_SUM2_SUS512_SPO1_WG16_8_1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM2_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU8_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM2_SUS512_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA2_WSGRB1_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 4 StaggerUMapping: 2 @@ -2422,6 +2471,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 32 SubGroupA: 4 @@ -2434,6 +2486,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -2447,9 +2500,12 @@ WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 8, 1] WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 8] _DepthU: 64 @@ -2466,6 +2522,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 2 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -2476,21 +2534,28 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 15 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -2501,7 +2566,8 @@ InterleaveAlpha: 0 InternalSupportParams: {SupportUserGSU: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT16x32x64_MI16x16x1_SN_LDSB0_LBSPPA128_LBSPPB128_MIWT1_1_NTB1_NLCA1_SU4_SUM2_SUS256_SPO1_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM2_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA2_WSGRB1_WS64_WG16_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 2 @@ -2563,7 +2629,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -2574,6 +2642,7 @@ NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 @@ -2595,8 +2664,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT16x32x64_MI16x16x1_SN_LDSB0_GSU15_LBSPPA128_LBSPPB128_MIWT1_1_NTB1_NLCA1_SU4_SUM2_SUS256_SPO1_WG16_8_1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM2_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU15_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM2_SUS256_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA2_WSGRB1_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 4 StaggerUMapping: 2 @@ -2605,6 +2674,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 32 SubGroupA: 4 @@ -2617,6 +2689,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -2630,9 +2703,12 @@ WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 8, 1] WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 15] _DepthU: 64 @@ -2649,6 +2725,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -2659,21 +2737,28 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -2684,7 +2769,8 @@ InterleaveAlpha: 0 InternalSupportParams: {SupportUserGSU: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_LBSPPA1024_LBSPPB128_MIWT8_7_SU0_SUM0_SUS0_SVW8_VWA8_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA2_WSGRB1_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 4 @@ -2746,7 +2832,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -2757,6 +2845,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 224 NumGlobalWriteVectorsPerThread: 28 @@ -2778,8 +2867,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA1024_LBSPPB128_MIWT8_7_SU0_SUM0_SUS0_SVW8_VWA8_WG32_8_1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA2_WSGRB1_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -2788,6 +2877,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -2800,6 +2892,7 @@ ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -2813,9 +2906,12 @@ WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -2832,6 +2928,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -2842,21 +2940,28 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -2867,7 +2972,8 @@ InterleaveAlpha: 0 InternalSupportParams: {SupportUserGSU: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_LBSPPA1024_LBSPPB128_MIWT8_7_SU4_SUM2_SUS256_SVW8_VWA8_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA2_WSGRB1_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 4 @@ -2929,7 +3035,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -2940,6 +3048,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 224 NumGlobalWriteVectorsPerThread: 28 @@ -2961,8 +3070,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA1024_LBSPPB128_MIWT8_7_SU4_SUM2_SUS256_SVW8_VWA8_WG32_8_1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM2_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA2_WSGRB1_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 4 StaggerUMapping: 2 @@ -2971,6 +3080,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -2983,6 +3095,7 @@ ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -2996,9 +3109,12 @@ WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -3015,6 +3131,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -3025,21 +3143,28 @@ CUCount: null ClusterLocalRead: 1 CodeObjectVersion: default + ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -3050,7 +3175,8 @@ InterleaveAlpha: 0 InternalSupportParams: {SupportUserGSU: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWB4_LBSPPA512_LBSPPB128_MIWT4_9_SU0_SUM0_SUS0_SVW4_VWA4_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x144x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA2_WSGRB1_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 4 @@ -3112,7 +3238,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -3123,6 +3251,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 144 NumGlobalWriteVectorsPerThread: 36 @@ -3144,8 +3273,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA512_LBSPPB128_MIWT4_9_SU0_SUM0_SUS0_SVW4_VWA4_WG64_4_1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x144x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA2_WSGRB1_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -3154,6 +3283,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -3166,6 +3298,7 @@ ThreadTileB: 9 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -3179,9 +3312,12 @@ WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -3194,194 +3330,12 @@ _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 2 - AssertSummationElementMultiple: 32 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - CustomKernelName: Custom_Cijk_Alik_Bljk_BBS_BH_MT16x16x128_MI16x16x1_SN_GSUM_K1_MIWT1_1_3 - DepthU: 128 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 7 - GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomWGM: true, SupportUserGSU: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT16x16x128_MI16x16x1_SN_GSUM_MIWT1_1 - LSCA: 128 - LSCB: 128 - LSPA: 1 - LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumElements: 12800 - LdsNumElementsAlignedA: 2304 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2304 - LdsOffsetB_Blk: 10496 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2304 - LdsOffsetMetadata_Blk: 10496 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: 0.4 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 - MFMA_BF16_1K: true - MIArchVgpr: 0 - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT16x16x128_MI16x16x1_SN_GSU7_MIWT1_1_WGM1 - SourceSwap: 0 - StaggerU: 4 - StaggerUMapping: 2 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SynchronizerSizeCheck: 1 - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 1 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 7] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -3394,10 +3348,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -3409,6 +3366,8 @@ GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -3420,7 +3379,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_LBSPPA256_MIWT14_4_SS1_SVW2_VWA2_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT14_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -3482,7 +3442,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -3493,6 +3455,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 224 NumGlobalWriteVectorsPerThread: 112 @@ -3514,8 +3477,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_GSU1_LBSPPA256_MIWT14_4_SS1_SU8_SUM1_SUS128_SVW2_VWA2_WG16_16_1_WGM0 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT14_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -3524,6 +3487,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -3536,6 +3502,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -3549,10 +3516,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -3569,6 +3538,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -3581,20 +3552,26 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -3606,7 +3583,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA1024_LBSPPB1024_LPA16_LPB16_LRVW8_MIWT1_1_SVW1_VWA1_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: false LSCA: 512 LSCB: 512 LSPA: 2 @@ -3668,7 +3646,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -3679,6 +3659,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 @@ -3700,8 +3681,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA1024_LBSPPB1024_LPA16_LPB16_LRVW8_MIWT1_1_SU0_SUM0_SUS0_SVW1_VWA1_WG32_4_1_WGM0 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -3710,6 +3691,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 16 SubGroupA: 8 @@ -3722,6 +3706,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -3735,10 +3720,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 4, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 512 @@ -3755,6 +3742,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -3767,20 +3756,26 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 + ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -3792,7 +3787,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_LBSPPA512_LBSPPB512_LPA16_LPB16_LRVW8_MIWT1_1_SVW1_VWA1_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: false LSCA: 256 LSCB: 256 LSPA: 4 @@ -3854,7 +3850,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -3865,6 +3863,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 @@ -3886,8 +3885,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB512_LPA16_LPB16_LRVW8_MIWT1_1_SU0_SUM0_SUS0_SVW1_VWA1_WG32_4_1_WGM0 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -3896,6 +3895,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 16 SubGroupA: 8 @@ -3908,6 +3910,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -3921,10 +3924,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 4, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 256 @@ -3941,6 +3946,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -3953,10 +3960,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -3968,6 +3978,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -3979,7 +3991,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_1_SS1_SVW1_VWA1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -4041,7 +4054,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -4052,6 +4067,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 @@ -4073,8 +4089,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_1_SS1_SU8_SUM1_SUS256_SVW1_VWA1_WG32_8_1_WGM8 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -4083,6 +4099,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -4095,6 +4114,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -4108,10 +4128,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -4128,6 +4150,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -4140,10 +4164,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -4155,6 +4182,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -4166,7 +4195,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SS1_SVW2_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -4228,7 +4258,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -4239,6 +4271,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 @@ -4260,8 +4293,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SS1_SU0_SUM0_SUS0_SVW2_VWA2_WG32_8_1_WGM8 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -4270,6 +4303,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -4282,6 +4318,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -4295,10 +4332,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -4315,6 +4354,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -4327,10 +4368,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -4342,6 +4386,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -4353,7 +4399,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_3_SS1_SVW1_VWA1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -4415,7 +4462,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -4426,6 +4475,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 12 NumGlobalWriteVectorsPerThread: 12 @@ -4447,8 +4497,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_3_SS1_SU8_SUM1_SUS256_SVW1_VWA1_WG32_8_1_WGM8 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -4457,6 +4507,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -4469,6 +4522,7 @@ ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -4482,10 +4536,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -4502,6 +4558,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -4514,10 +4572,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -4529,6 +4590,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -4540,7 +4603,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_1_SS0_SVW8_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -4602,7 +4666,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -4613,6 +4679,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 @@ -4634,8 +4701,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_1_SS0_SU8_SUM1_SUS256_SVW8_VWA2_WG32_8_1_WGM8 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 StaggerUMapping: 1 @@ -4644,6 +4711,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -4656,6 +4726,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -4669,10 +4740,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -4689,6 +4762,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -4701,10 +4776,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -4716,6 +4794,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -4727,7 +4807,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA1024_LBSPPB1024_LPA16_LPB16_LRVW8_MIWT1_1_SS0_SVW4_VWA1_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: false LSCA: 512 LSCB: 512 LSPA: 2 @@ -4789,7 +4870,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -4800,6 +4883,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 @@ -4821,8 +4905,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA1024_LBSPPB1024_LPA16_LPB16_LRVW8_MIWT1_1_SS0_SU8_SUM1_SUS1024_SVW4_VWA1_WG32_4_1_WGM0 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 StaggerUMapping: 1 @@ -4831,6 +4915,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 16 SubGroupA: 8 @@ -4843,6 +4930,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -4856,10 +4944,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 4, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 512 @@ -4876,6 +4966,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -4888,10 +4980,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -4903,6 +4998,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -4914,7 +5011,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LBSPPB512_LPA16_LPB16_LRVW8_MIWT1_2_SS1_SVW1_VWA1_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 256 LSCB: 256 LSPA: 8 @@ -4976,7 +5074,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -4987,6 +5087,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 @@ -5008,8 +5109,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB512_LPA16_LPB16_LRVW8_MIWT1_2_SS1_SU0_SUM0_SUS0_SVW1_VWA1_WG64_4_1_WGM8 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -5018,6 +5119,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -5030,6 +5134,7 @@ ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -5043,10 +5148,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 256 @@ -5063,6 +5170,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -5075,10 +5184,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -5090,6 +5202,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -5101,7 +5215,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SS1_SVW2_VWA2_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -5163,7 +5278,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -5174,6 +5291,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 @@ -5195,8 +5313,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SS1_SU8_SUM1_SUS1024_SVW2_VWA2_WG64_4_1_WGM8 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -5205,6 +5323,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -5217,6 +5338,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -5230,10 +5352,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -5250,6 +5374,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -5262,10 +5388,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -5277,6 +5406,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -5288,7 +5419,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_3_SS1_SVW1_VWA1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -5350,7 +5482,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -5361,6 +5495,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 12 NumGlobalWriteVectorsPerThread: 12 @@ -5382,8 +5517,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_3_SS1_SU8_SUM1_SUS512_SVW1_VWA1_WG32_8_1_WGM8 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -5392,6 +5527,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -5404,6 +5542,7 @@ ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -5417,10 +5556,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -5437,6 +5578,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -5449,10 +5592,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -5464,6 +5610,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -5475,7 +5623,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA1024_LBSPPB256_LPA16_LPB16_LRVW8_MIWT4_2_SS1_SVW4_VWA4_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -5537,7 +5686,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -5548,6 +5699,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 @@ -5569,8 +5721,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 29 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA1024_LBSPPB256_LPA16_LPB16_LRVW8_MIWT4_2_SS1_SU8_SUM1_SUS1024_SVW4_VWA4_WG16_16_1_WGM0 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -5579,6 +5731,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -5591,6 +5746,7 @@ ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -5604,10 +5760,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -5624,6 +5782,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -5636,10 +5796,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -5651,6 +5814,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -5662,7 +5827,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_2_SS1_SVW2_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -5724,7 +5890,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -5735,6 +5903,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 @@ -5756,8 +5925,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 30 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_2_SS1_SU8_SUM1_SUS256_SVW2_VWA2_WG32_8_1_WGM0 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -5766,6 +5935,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -5778,6 +5950,7 @@ ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -5791,10 +5964,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -5811,6 +5986,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -5823,10 +6000,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -5838,6 +6018,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -5849,7 +6031,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_3_SS1_SVW2_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -5911,7 +6094,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -5922,6 +6107,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 12 @@ -5943,8 +6129,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 31 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_3_SS1_SU8_SUM1_SUS1024_SVW2_VWA2_WG32_8_1_WGM0 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -5953,6 +6139,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -5965,6 +6154,7 @@ ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -5978,10 +6168,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -5998,6 +6190,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -6010,10 +6204,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -6025,6 +6222,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -6036,7 +6235,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_6_SS1_SVW4_VWA4_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -6098,7 +6298,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -6109,6 +6311,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 96 NumGlobalWriteVectorsPerThread: 24 @@ -6130,8 +6333,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 32 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_6_SS1_SU8_SUM1_SUS512_SVW4_VWA4_WG32_8_1_WGM0 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -6140,6 +6343,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -6152,6 +6358,7 @@ ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -6165,10 +6372,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -6185,6 +6394,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -6197,10 +6408,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -6212,6 +6426,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -6223,7 +6439,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LBSPPB512_LPA16_LPB16_LRVW8_MIWT1_2_SS1_SVW1_VWA1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 256 LSCB: 256 LSPA: 8 @@ -6285,7 +6502,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -6296,6 +6515,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 @@ -6317,8 +6537,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 33 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB512_LPA16_LPB16_LRVW8_MIWT1_2_SS1_SU0_SUM0_SUS0_SVW1_VWA1_WG32_8_1_WGM8 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -6327,6 +6547,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -6339,6 +6562,7 @@ ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -6352,10 +6576,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 256 @@ -6372,7 +6598,9 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AssertFree0ElementMultiple: 8 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true @@ -6384,10 +6612,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -6399,6 +6630,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -6410,7 +6643,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SS0_SVW8_VWA2_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -6472,7 +6706,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -6483,6 +6719,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 @@ -6504,8 +6741,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 34 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SS0_SU0_SUM0_SUS0_SVW8_VWA2_WG64_4_1_WGM8 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 0 StaggerUMapping: 0 @@ -6514,6 +6751,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -6526,6 +6766,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -6539,10 +6780,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -6559,6 +6802,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -6571,10 +6816,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -6586,6 +6834,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -6597,7 +6847,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA1024_LBSPPB1024_LPA16_LPB16_LRVW8_MIWT1_1_SS1_SVW1_VWA1_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: false LSCA: 512 LSCB: 512 LSPA: 2 @@ -6659,7 +6910,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -6670,6 +6923,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 @@ -6691,8 +6945,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 35 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA1024_LBSPPB1024_LPA16_LPB16_LRVW8_MIWT1_1_SS1_SU0_SUM0_SUS0_SVW1_VWA1_WG32_4_1_WGM8 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -6701,6 +6955,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 16 SubGroupA: 8 @@ -6713,6 +6970,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -6726,10 +6984,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 4, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 512 @@ -6746,6 +7006,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -6758,10 +7020,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -6773,6 +7038,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -6784,7 +7051,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA1024_LBSPPB1024_LPA16_LPB16_LRVW8_MIWT1_1_SS0_SVW4_VWA1_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: false LSCA: 512 LSCB: 512 LSPA: 2 @@ -6846,7 +7114,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -6857,6 +7127,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 @@ -6878,8 +7149,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 36 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA1024_LBSPPB1024_LPA16_LPB16_LRVW8_MIWT1_1_SS0_SU0_SUM0_SUS0_SVW4_VWA1_WG16_8_1_WGM8 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 0 StaggerUMapping: 0 @@ -6888,6 +7159,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 32 SubGroupA: 4 @@ -6900,6 +7174,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -6913,10 +7188,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 512 @@ -6933,6 +7210,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -6945,10 +7224,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -6960,6 +7242,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -6971,7 +7255,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA1024_LBSPPB512_LPA16_LPB16_LRVW8_MIWT2_1_SS0_SVW8_VWA2_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 256 LSCB: 256 LSPA: 8 @@ -7033,7 +7318,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -7044,6 +7331,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 @@ -7065,8 +7353,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 37 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA1024_LBSPPB512_LPA16_LPB16_LRVW8_MIWT2_1_SS0_SU0_SUM0_SUS0_SVW8_VWA2_WG16_16_1_WGM8 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 0 StaggerUMapping: 0 @@ -7075,6 +7363,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -7087,6 +7378,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -7100,10 +7392,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 256 @@ -7120,6 +7414,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -7132,10 +7428,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -7147,6 +7446,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -7158,7 +7459,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT3_3_SS1_SVW1_VWA1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -7220,7 +7522,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -7231,6 +7535,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 36 NumGlobalWriteVectorsPerThread: 36 @@ -7252,8 +7557,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 38 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT3_3_SS1_SU8_SUM1_SUS512_SVW1_VWA1_WG32_8_1_WGM0 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -7262,6 +7567,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -7274,6 +7582,7 @@ ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -7287,10 +7596,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -7307,6 +7618,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -7319,10 +7632,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -7334,6 +7650,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -7345,7 +7663,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_8_SS1_SVW4_VWA4_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -7407,7 +7726,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -7418,6 +7739,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 128 NumGlobalWriteVectorsPerThread: 32 @@ -7439,8 +7761,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 39 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_8_SS1_SU32_SUM1_SUS128_SVW4_VWA4_WG64_4_1_WGM0 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 StaggerUMapping: 1 @@ -7449,6 +7771,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -7461,6 +7786,7 @@ ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -7474,10 +7800,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -7494,6 +7822,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -7506,10 +7836,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -7521,6 +7854,8 @@ GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -7532,7 +7867,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_LBSPPA1024_MIWT8_6_SS1_SVW8_VWA8_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -7594,7 +7930,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -7605,6 +7943,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 192 NumGlobalWriteVectorsPerThread: 24 @@ -7626,8 +7965,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 40 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_GSU1_LBSPPA1024_MIWT8_6_SS1_SU0_SUM0_SUS0_SVW8_VWA8_WG32_8_1_WGM8 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -7636,6 +7975,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -7648,6 +7990,7 @@ ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -7661,10 +8004,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -7681,6 +8026,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -7693,10 +8040,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -7708,6 +8058,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -7719,7 +8071,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_3_SS1_SVW2_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -7781,7 +8134,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -7792,6 +8147,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 12 @@ -7813,8 +8169,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 41 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_3_SS1_SU8_SUM1_SUS512_SVW2_VWA2_WG32_8_1_WGM0 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -7823,6 +8179,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -7835,6 +8194,7 @@ ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -7848,10 +8208,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -7868,6 +8230,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -7880,10 +8244,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -7895,6 +8262,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -7906,7 +8275,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SS0_SVW8_VWA2_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -7968,7 +8338,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -7979,6 +8351,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 @@ -8000,8 +8373,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 42 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SS0_SU8_SUM1_SUS512_SVW8_VWA2_WG64_4_1_WGM8 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 StaggerUMapping: 1 @@ -8010,6 +8383,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -8022,6 +8398,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -8035,10 +8412,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -8055,6 +8434,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -8067,10 +8448,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -8082,6 +8466,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -8093,7 +8479,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_6_SS1_SVW4_VWA4_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -8155,7 +8542,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -8166,6 +8555,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 96 NumGlobalWriteVectorsPerThread: 24 @@ -8187,8 +8577,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 43 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_6_SS1_SU32_SUM1_SUS1024_SVW4_VWA4_WG32_8_1_WGM8 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 StaggerUMapping: 1 @@ -8197,6 +8587,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -8209,6 +8602,7 @@ ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -8222,10 +8616,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -8242,6 +8638,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -8254,10 +8652,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -8269,6 +8670,8 @@ GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -8280,7 +8683,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_LBSPPA256_MIWT6_8_SS0_SVW8_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -8342,7 +8746,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -8353,6 +8759,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 192 NumGlobalWriteVectorsPerThread: 96 @@ -8374,8 +8781,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 44 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_GSU1_LBSPPA256_MIWT6_8_SS0_SU8_SUM1_SUS512_SVW8_VWA2_WG32_8_1_WGM0 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 StaggerUMapping: 1 @@ -8384,6 +8791,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -8396,6 +8806,7 @@ ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -8409,10 +8820,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -8429,6 +8842,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -8441,10 +8856,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -8456,6 +8874,8 @@ GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -8467,7 +8887,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_LBSPPA256_MIWT6_8_SS0_SVW8_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -8529,7 +8950,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -8540,6 +8963,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 192 NumGlobalWriteVectorsPerThread: 96 @@ -8561,8 +8985,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 45 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_GSU1_LBSPPA256_MIWT6_8_SS0_SU8_SUM1_SUS1024_SVW8_VWA2_WG32_8_1_WGM0 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 StaggerUMapping: 1 @@ -8571,6 +8995,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -8583,6 +9010,7 @@ ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -8596,10 +9024,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -8616,6 +9046,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -8628,10 +9060,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -8643,6 +9078,8 @@ GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -8654,7 +9091,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_LBSPPA256_MIWT6_9_SS0_SVW8_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -8716,7 +9154,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -8727,6 +9167,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 216 NumGlobalWriteVectorsPerThread: 108 @@ -8748,8 +9189,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 46 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_GSU1_LBSPPA256_MIWT6_9_SS0_SU8_SUM1_SUS1024_SVW8_VWA2_WG32_8_1_WGM8 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x288x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 StaggerUMapping: 1 @@ -8758,6 +9199,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -8770,6 +9214,7 @@ ThreadTileB: 9 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -8783,10 +9228,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -8803,6 +9250,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -8815,10 +9264,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -8830,6 +9282,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -8841,7 +9295,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT208x192x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB8_LBSPPA128_LPA16_LPB16_LRVW8_MIWT13_3_SS0_SVW4_VWA1_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT208x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT13_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -8903,7 +9358,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -8914,6 +9371,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 156 NumGlobalWriteVectorsPerThread: 156 @@ -8935,8 +9393,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 47 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT208x192x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB8_GSU1_LBSPPA128_LPA16_LPB16_LRVW8_MIWT13_3_SS0_SU8_SUM1_SUS1024_SVW4_VWA1_WG16_16_1_WGM0 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT208x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT13_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 StaggerUMapping: 1 @@ -8945,6 +9403,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -8957,6 +9418,7 @@ ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -8970,10 +9432,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -8990,6 +9454,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -9002,10 +9468,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -9017,6 +9486,8 @@ GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -9028,7 +9499,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT288x192x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_LBSPPA128_MIWT9_6_SS1_SVW1_VWA1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT288x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT9_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -9090,7 +9562,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -9101,6 +9575,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 216 NumGlobalWriteVectorsPerThread: 216 @@ -9122,8 +9597,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 48 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT288x192x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_GSU1_LBSPPA128_MIWT9_6_SS1_SU8_SUM1_SUS128_SVW1_VWA1_WG32_8_1_WGM8 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT288x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT9_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -9132,6 +9607,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -9144,6 +9622,7 @@ ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -9157,10 +9636,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -9177,6 +9658,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -9189,10 +9672,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -9204,6 +9690,8 @@ GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -9215,7 +9703,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_GRVWA4_LBSPPA256_MIWT6_8_SS1_SVW2_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -9277,7 +9766,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -9288,6 +9779,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 192 NumGlobalWriteVectorsPerThread: 96 @@ -9309,8 +9801,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 49 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_GRVWA4_GSU1_LBSPPA256_MIWT6_8_SS1_SU32_SUM1_SUS128_SVW2_VWA2_WG32_8_1_WGM0 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 StaggerUMapping: 1 @@ -9319,6 +9811,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -9331,6 +9826,7 @@ ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -9344,10 +9840,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -9364,6 +9862,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -9376,10 +9876,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -9391,6 +9894,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -9402,7 +9907,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA1024_LBSPPB256_LPA16_LPB16_LRVW8_MIWT4_2_SS1_SVW4_VWA4_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -9464,7 +9970,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -9475,6 +9983,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 @@ -9496,8 +10005,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 50 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA1024_LBSPPB256_LPA16_LPB16_LRVW8_MIWT4_2_SS1_SU8_SUM1_SUS256_SVW4_VWA4_WG16_16_1_WGM0 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -9506,6 +10015,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -9518,6 +10030,7 @@ ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -9531,10 +10044,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -9551,6 +10066,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -9563,10 +10080,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -9578,6 +10098,8 @@ GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -9589,7 +10111,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_LBSPPA1024_MIWT8_7_SS1_SVW8_VWA8_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -9651,7 +10174,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -9662,6 +10187,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 224 NumGlobalWriteVectorsPerThread: 28 @@ -9683,8 +10209,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 51 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_GSU1_LBSPPA1024_MIWT8_7_SS1_SU8_SUM1_SUS512_SVW8_VWA8_WG32_8_1_WGM0 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -9693,6 +10219,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -9705,6 +10234,7 @@ ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -9718,10 +10248,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -9738,6 +10270,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -9750,10 +10284,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -9765,6 +10302,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -9776,7 +10315,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SS0_SVW8_VWA2_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -9838,7 +10378,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -9849,6 +10391,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 @@ -9870,8 +10413,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 52 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SS0_SU8_SUM1_SUS512_SVW8_VWA2_WG64_4_1_WGM0 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 StaggerUMapping: 1 @@ -9880,6 +10423,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -9892,6 +10438,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -9905,10 +10452,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -9925,6 +10474,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -9937,10 +10488,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -9952,6 +10506,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -9963,7 +10519,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SS1_SVW2_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -10025,7 +10582,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -10036,6 +10595,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 @@ -10057,8 +10617,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 53 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SS1_SU8_SUM1_SUS256_SVW2_VWA2_WG32_8_1_WGM0 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -10067,6 +10627,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -10079,6 +10642,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -10092,10 +10656,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -10112,6 +10678,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -10124,10 +10692,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -10139,6 +10710,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -10150,7 +10723,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIWT2_6_SS0_SVW8_VWA2_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -10212,7 +10786,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -10223,6 +10799,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 48 NumGlobalWriteVectorsPerThread: 24 @@ -10244,8 +10821,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 54 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIWT2_6_SS0_SU8_SUM1_SUS1024_SVW8_VWA2_WG64_4_1_WGM8 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 StaggerUMapping: 1 @@ -10254,6 +10831,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -10266,6 +10846,7 @@ ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -10279,10 +10860,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -10299,6 +10882,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -10311,10 +10896,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -10326,6 +10914,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -10337,7 +10927,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_7_SS1_SVW4_VWA4_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -10399,7 +10990,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -10410,6 +11003,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 112 NumGlobalWriteVectorsPerThread: 28 @@ -10431,8 +11025,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 55 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_7_SS1_SU32_SUM1_SUS512_SVW4_VWA4_WG32_8_1_WGM0 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 StaggerUMapping: 1 @@ -10441,6 +11035,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -10453,6 +11050,7 @@ ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -10466,10 +11064,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -10486,6 +11086,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -10498,10 +11100,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -10513,6 +11118,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -10524,7 +11131,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_2_SS1_SVW2_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -10586,7 +11194,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -10597,6 +11207,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 @@ -10618,8 +11229,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 56 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_2_SS1_SU32_SUM1_SUS512_SVW2_VWA2_WG32_8_1_WGM0 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 StaggerUMapping: 1 @@ -10628,6 +11239,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -10640,6 +11254,7 @@ ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -10653,10 +11268,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -10673,6 +11290,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -10685,10 +11304,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -10700,6 +11322,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -10711,7 +11335,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_2_SS0_SVW8_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -10773,7 +11398,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -10784,6 +11411,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 @@ -10805,8 +11433,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 57 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_2_SS0_SU8_SUM1_SUS1024_SVW8_VWA2_WG32_8_1_WGM0 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 StaggerUMapping: 1 @@ -10815,6 +11443,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -10827,6 +11458,7 @@ ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -10840,10 +11472,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -10860,6 +11494,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -10872,10 +11508,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -10887,6 +11526,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -10898,7 +11539,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_4_SS1_SVW4_VWA4_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -10960,7 +11602,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -10971,6 +11615,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 @@ -10992,8 +11637,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 58 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_4_SS1_SU8_SUM1_SUS256_SVW4_VWA4_WG32_8_1_WGM8 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -11002,6 +11647,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -11014,6 +11662,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -11027,10 +11676,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -11047,6 +11698,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -11059,10 +11712,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -11074,6 +11730,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -11085,7 +11743,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA1024_LBSPPB512_LPA16_LPB16_LRVW8_MIWT2_1_SS1_SVW2_VWA2_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 256 LSCB: 256 LSPA: 8 @@ -11147,7 +11806,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -11158,6 +11819,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 @@ -11179,8 +11841,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 59 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA1024_LBSPPB512_LPA16_LPB16_LRVW8_MIWT2_1_SS1_SU0_SUM0_SUS0_SVW2_VWA2_WG16_16_1_WGM8 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -11189,6 +11851,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -11201,6 +11866,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -11214,10 +11880,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 256 @@ -11234,6 +11902,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -11246,10 +11916,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -11261,6 +11934,8 @@ GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -11272,7 +11947,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_GRVWB4_LBSPPA1024_MIWT8_6_SS1_SVW8_VWA8_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -11285,9 +11961,9 @@ LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumBytes: 59136 + LdsNumBytes: 63488 LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedB: 30464 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -11296,7 +11972,7 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 59136 + LdsOffsetMetadata: 63488 LdsOffsetMetadata_Blk: 98560 LdsPadA: 4 LdsPadB: 4 @@ -11318,14 +11994,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 6] + MIWaveTile: [8, 7] MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveTileB: 7 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 192 + MacroTile1: 224 MacroTileA: 256 - MacroTileB: 192 + MacroTileB: 224 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -11334,7 +12010,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -11345,15 +12023,16 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 28 NumLoadsA: 16 - NumLoadsB: 12 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularB: 14 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -11366,203 +12045,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 60 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA1024_MIWT8_6_SS1_SU0_SUM0_SUS0_SVW8_VWA8_WG32_8_1_WGM8 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 8 - AssertSummationElementMultiple: 32 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 8 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_LBSPPA1024_LPA4_LPB4_LRVW4_MIWT8_7_SS1_SVW8_VWA8_WG32_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 33024 - LdsNumElementsAlignedB: 30464 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33024 - LdsOffsetB_Blk: 98560 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 63488 - LdsOffsetMetadata_Blk: 98560 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: -1 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 16 - NumLoadsB: 14 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 14 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 1 - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 61 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_GSU1_LBSPPA1024_LPA4_LPB4_LRVW4_MIWT8_7_SS1_SU32_SUM1_SUS128_SVW8_VWA8_WG32_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 1 - StaggerUStride: 128 + StaggerU: 32 + StaggerUMapping: 1 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -11575,6 +12070,7 @@ ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -11588,10 +12084,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -11608,6 +12106,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -11620,10 +12120,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -11635,6 +12138,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -11646,7 +12151,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_LBSPPA512_LBSPPB128_LPA16_LPB16_LRVW8_MIWT4_3_SS1_SVW4_VWA4_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -11708,7 +12214,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -11719,6 +12227,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 48 NumGlobalWriteVectorsPerThread: 12 @@ -11740,8 +12249,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 62 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB128_LPA16_LPB16_LRVW8_MIWT4_3_SS1_SU32_SUM1_SUS1024_SVW4_VWA4_WG32_8_1_WGM0 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 StaggerUMapping: 1 @@ -11750,6 +12259,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -11762,6 +12274,7 @@ ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -11775,10 +12288,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -11795,6 +12310,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -11807,10 +12324,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -11822,6 +12342,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -11833,7 +12355,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_LBSPPA512_LBSPPB128_LPA16_LPB16_LRVW8_MIWT4_3_SS1_SVW4_VWA4_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -11895,7 +12418,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -11906,6 +12431,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 48 NumGlobalWriteVectorsPerThread: 12 @@ -11927,8 +12453,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 63 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB128_LPA16_LPB16_LRVW8_MIWT4_3_SS1_SU8_SUM1_SUS256_SVW4_VWA4_WG32_8_1_WGM8 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -11937,6 +12463,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -11949,6 +12478,7 @@ ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -11962,10 +12492,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -11982,6 +12514,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -11994,10 +12528,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -12009,6 +12546,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -12020,7 +12559,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x160x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA256_LPA16_LPB16_LRVW8_MIWT6_5_SS1_SVW2_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -12082,7 +12622,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -12093,6 +12635,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 120 NumGlobalWriteVectorsPerThread: 60 @@ -12114,8 +12657,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 64 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x160x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA256_LPA16_LPB16_LRVW8_MIWT6_5_SS1_SU8_SUM1_SUS256_SVW2_VWA2_WG32_8_1_WGM0 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -12124,6 +12667,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -12136,6 +12682,7 @@ ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -12149,10 +12696,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -12169,6 +12718,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -12181,10 +12732,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -12196,6 +12750,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -12207,7 +12763,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_1_SS0_SVW4_VWA1_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -12269,7 +12826,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -12280,6 +12839,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 @@ -12301,8 +12861,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 65 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_GSU1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_1_SS0_SU8_SUM1_SUS1024_SVW4_VWA1_WG64_4_1_WGM0 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 StaggerUMapping: 1 @@ -12311,8 +12871,11 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 16 - SubGroup1: 16 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false @@ -12323,6 +12886,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -12336,10 +12900,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -12356,6 +12922,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -12368,10 +12936,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -12383,6 +12954,8 @@ GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -12394,7 +12967,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_LBSPPA128_MIWT7_8_SS1_SVW1_VWA1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -12456,7 +13030,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -12467,6 +13043,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 224 NumGlobalWriteVectorsPerThread: 224 @@ -12488,8 +13065,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 66 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_GSU1_LBSPPA128_MIWT7_8_SS1_SU8_SUM1_SUS512_SVW1_VWA1_WG32_8_1_WGM8 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT224x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -12498,6 +13075,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -12510,6 +13090,7 @@ ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -12523,10 +13104,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -12543,6 +13126,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -12555,10 +13140,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -12570,6 +13158,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -12581,7 +13171,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_4_SS1_SVW4_VWA4_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -12643,7 +13234,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -12654,6 +13247,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 @@ -12675,8 +13269,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 67 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_4_SS1_SU32_SUM1_SUS256_SVW4_VWA4_WG32_8_1_WGM8 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 StaggerUMapping: 1 @@ -12685,6 +13279,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -12697,6 +13294,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -12710,10 +13308,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -12730,6 +13330,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -12742,10 +13344,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -12757,6 +13362,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -12768,7 +13375,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_4_SS1_SVW4_VWA4_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -12830,7 +13438,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -12841,6 +13451,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 @@ -12862,8 +13473,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 68 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA512_LPA16_LPB16_LRVW8_MIWT4_4_SS1_SU8_SUM1_SUS512_SVW4_VWA4_WG32_8_1_WGM8 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -12872,6 +13483,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -12884,6 +13498,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -12897,10 +13512,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -12917,6 +13534,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -12929,10 +13548,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -12944,6 +13566,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -12955,7 +13579,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA1024_LPA16_LPB16_LRVW8_MIWT8_4_SS1_SVW8_VWA8_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -13017,7 +13642,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -13028,6 +13655,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 128 NumGlobalWriteVectorsPerThread: 16 @@ -13049,8 +13677,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 69 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA1024_LPA16_LPB16_LRVW8_MIWT8_4_SS1_SU8_SUM1_SUS512_SVW8_VWA8_WG16_16_1_WGM0 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -13059,6 +13687,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -13071,6 +13702,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -13084,10 +13716,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -13104,6 +13738,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -13116,10 +13752,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -13131,6 +13770,8 @@ GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -13142,7 +13783,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_LBSPPA1024_MIWT8_7_SS1_SVW8_VWA8_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -13204,7 +13846,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -13215,6 +13859,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 224 NumGlobalWriteVectorsPerThread: 28 @@ -13236,8 +13881,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 70 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_GSU1_LBSPPA1024_MIWT8_7_SS1_SU8_SUM1_SUS128_SVW8_VWA8_WG32_8_1_WGM8 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS128_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -13246,6 +13891,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -13258,6 +13906,7 @@ ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -13271,10 +13920,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -13291,6 +13942,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -13303,10 +13956,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -13318,6 +13974,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -13329,7 +13987,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x192x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA256_LPA16_LPB16_LRVW8_MIWT6_3_SS1_SVW2_VWA2_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -13391,7 +14050,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -13402,6 +14063,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 72 NumGlobalWriteVectorsPerThread: 36 @@ -13423,8 +14085,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 71 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x192x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA256_LPA16_LPB16_LRVW8_MIWT6_3_SS1_SU8_SUM1_SUS1024_SVW2_VWA2_WG16_16_1_WGM0 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -13433,6 +14095,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -13445,6 +14110,7 @@ ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -13458,10 +14124,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -13478,6 +14146,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -13490,10 +14160,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -13505,6 +14178,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -13516,7 +14191,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA256_LPA16_LPB16_LRVW8_MIWT10_2_SS1_SVW2_VWA2_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -13578,7 +14254,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -13589,6 +14267,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 80 NumGlobalWriteVectorsPerThread: 40 @@ -13610,8 +14289,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 72 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA256_LPA16_LPB16_LRVW8_MIWT10_2_SS1_SU0_SUM0_SUS0_SVW2_VWA2_WG16_16_1_WGM0 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT10_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -13620,6 +14299,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -13632,6 +14314,7 @@ ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -13645,10 +14328,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -13665,6 +14350,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -13677,10 +14364,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -13692,6 +14382,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -13703,7 +14395,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_LBSPPA256_LPA16_LPB16_LRVW8_MIWT6_6_SS0_SVW8_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -13765,7 +14458,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -13776,6 +14471,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 144 NumGlobalWriteVectorsPerThread: 72 @@ -13797,8 +14493,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 73 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA256_LPA16_LPB16_LRVW8_MIWT6_6_SS0_SU0_SUM0_SUS0_SVW8_VWA2_WG32_8_1_WGM8 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 0 StaggerUMapping: 0 @@ -13807,6 +14503,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -13819,6 +14518,7 @@ ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -13832,10 +14532,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -13852,6 +14554,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -13864,10 +14568,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -13879,6 +14586,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -13890,7 +14599,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_LBSPPA512_LBSPPB512_LPA16_LPB16_LRVW8_MIWT1_1_SS1_SVW1_VWA1_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: false LSCA: 256 LSCB: 256 LSPA: 4 @@ -13952,7 +14662,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -13963,6 +14675,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 @@ -13984,8 +14697,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 74 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB512_LPA16_LPB16_LRVW8_MIWT1_1_SS1_SU0_SUM0_SUS0_SVW1_VWA1_WG32_4_1_WGM0 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -13994,6 +14707,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 16 SubGroupA: 8 @@ -14006,6 +14722,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -14019,10 +14736,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 4, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 256 @@ -14039,6 +14758,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -14051,10 +14772,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -14066,6 +14790,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -14077,7 +14803,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_1_SS1_SVW1_VWA1_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 8 @@ -14139,7 +14866,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -14150,6 +14879,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 @@ -14171,8 +14901,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 75 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_GSU1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_1_SS1_SU8_SUM1_SUS256_SVW1_VWA1_WG32_4_1_WGM0 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -14181,6 +14911,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 16 SubGroupA: 8 @@ -14193,6 +14926,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -14206,10 +14940,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 4, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -14226,6 +14962,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -14238,10 +14976,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -14253,6 +14994,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -14264,7 +15007,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_1_SS1_SVW1_VWA1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -14326,7 +15070,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -14337,6 +15083,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 @@ -14358,8 +15105,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 76 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_GSU1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_1_SS1_SU8_SUM1_SUS256_SVW1_VWA1_WG32_8_1_WGM0 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -14368,6 +15115,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -14380,6 +15130,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -14393,10 +15144,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -14413,6 +15166,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -14425,10 +15180,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -14440,6 +15198,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -14451,7 +15211,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_1_SS1_SVW2_VWA2_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -14513,7 +15274,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -14524,6 +15287,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 @@ -14545,8 +15309,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 77 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_1_SS1_SU8_SUM1_SUS256_SVW2_VWA2_WG16_16_1_WGM0 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -14555,6 +15319,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -14567,6 +15334,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -14580,10 +15348,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -14600,6 +15370,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -14612,10 +15384,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -14627,6 +15402,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -14638,7 +15415,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_1_SS1_SVW2_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -14700,7 +15478,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -14711,6 +15491,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 @@ -14732,8 +15513,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 78 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_1_SS1_SU8_SUM1_SUS256_SVW2_VWA2_WG32_8_1_WGM0 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -14742,6 +15523,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -14754,6 +15538,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -14767,10 +15552,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -14787,6 +15574,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -14799,10 +15588,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -14814,6 +15606,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -14825,7 +15619,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_3_SS1_SVW1_VWA1_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -14887,7 +15682,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -14898,6 +15695,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 12 NumGlobalWriteVectorsPerThread: 12 @@ -14919,8 +15717,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 79 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_GSU1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_3_SS1_SU8_SUM1_SUS512_SVW1_VWA1_WG64_4_1_WGM0 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -14929,8 +15727,11 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 - SubGroup0: 16 - SubGroup1: 16 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false @@ -14941,6 +15742,7 @@ ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -14954,10 +15756,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -14974,6 +15778,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -14986,10 +15792,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -15001,6 +15810,8 @@ GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -15012,7 +15823,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_LBSPPA256_MIWT6_7_SS1_SVW2_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -15074,7 +15886,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -15085,6 +15899,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 168 NumGlobalWriteVectorsPerThread: 84 @@ -15106,8 +15921,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 80 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_GSU1_LBSPPA256_MIWT6_7_SS1_SU32_SUM1_SUS256_SVW2_VWA2_WG32_8_1_WGM8 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM1_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 StaggerUMapping: 1 @@ -15116,6 +15931,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -15128,6 +15946,7 @@ ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -15141,10 +15960,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -15161,6 +15982,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 @@ -15173,10 +15996,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -15188,6 +16014,8 @@ GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -15199,7 +16027,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_LBSPPA256_MIWT6_7_SS1_SVW2_VWA2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -15261,7 +16090,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -15272,6 +16103,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 168 NumGlobalWriteVectorsPerThread: 84 @@ -15293,8 +16125,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 81 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_GRVWA4_GRVWB4_GSU1_LBSPPA256_MIWT6_7_SS1_SU8_SUM1_SUS1024_SVW2_VWA2_WG32_8_1_WGM8 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM8_AFEM8_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 1 @@ -15303,6 +16135,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -15315,6 +16150,7 @@ ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -15328,10 +16164,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -15348,6 +16186,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 32 @@ -15360,10 +16200,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -15375,6 +16218,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -15386,7 +16231,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_1_SS0_SVW4_VWA1_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM1_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 8 @@ -15448,7 +16294,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -15459,6 +16307,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 @@ -15480,8 +16329,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 82 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_GRVWA8_GRVWB8_GSU1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT1_1_SS0_SU8_SUM1_SUS1024_SVW4_VWA1_WG32_4_1_WGM0 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM8_AFEM1_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM1_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 8 StaggerUMapping: 1 @@ -15490,6 +16339,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 16 SubGroupA: 8 @@ -15502,6 +16354,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -15515,10 +16368,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 4, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -15535,6 +16390,8 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -15547,10 +16404,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -15562,6 +16422,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -15573,7 +16435,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT80x256x64_MI16x16x1_SN_MIWT5_4_NEPBS0 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -15635,7 +16498,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -15646,6 +16511,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 80 NumGlobalWriteVectorsPerThread: 80 @@ -15667,8 +16533,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 83 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT80x256x64_MI16x16x1_SN_GSU1_MIWT5_4_NEPBS0_SU32_SUM0_SUS256_WGM32 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM32_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 32 StaggerUMapping: 0 @@ -15677,6 +16543,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -15689,6 +16558,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -15702,10 +16572,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 32 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -15722,6 +16594,8 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -15734,10 +16608,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -15749,6 +16626,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -15760,7 +16639,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_MIWT2_6_NEPBS0 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -15822,7 +16702,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -15833,6 +16715,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 48 NumGlobalWriteVectorsPerThread: 24 @@ -15854,8 +16737,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 84 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_GSU1_MIWT2_6_NEPBS0_SU4_SUM0_SUS512_WGM0 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM0_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 4 StaggerUMapping: 0 @@ -15864,6 +16747,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -15876,6 +16762,7 @@ ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -15889,10 +16776,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 0 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -15909,6 +16798,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -15921,10 +16812,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -15936,6 +16830,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -15947,7 +16843,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_GRVWB8_LBSPPA1024_LBSPPB512_LPA16_LPB16_LRVW8_MIWT2_1_NTC0_NTD0_SS1_SPO1_SVW2_VWA2_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false LSCA: 256 LSCB: 256 LSPA: 8 @@ -16009,7 +16906,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -16020,6 +16919,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 @@ -16041,8 +16941,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 85 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_GRVWB8_GSU1_LBSPPA1024_LBSPPB512_LPA16_LPB16_LRVW8_MIWT2_1_NTC0_NTD0_SS1_SU0_SUM0_SUS0_SPO1_SVW2_VWA2_WG16_4_4_WGM1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -16051,6 +16951,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 16 SubGroupA: 4 @@ -16063,6 +16966,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -16076,10 +16980,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 256 @@ -16096,6 +17002,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -16108,10 +17016,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -16123,6 +17034,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -16134,7 +17047,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_GRVWB8_LBSPPA1024_LBSPPB512_LPA16_LPB16_LRVW8_MIWT2_1_NTC0_NTD0_SS1_SPO0_SVW2_VWA2_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false LSCA: 256 LSCB: 256 LSPA: 8 @@ -16196,7 +17110,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -16207,6 +17123,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 @@ -16228,8 +17145,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 86 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_GRVWB8_GSU1_LBSPPA1024_LBSPPB512_LPA16_LPB16_LRVW8_MIWT2_1_NTC0_NTD0_SS1_SU0_SUM0_SUS0_SPO0_SVW2_VWA2_WG16_4_4_WGM1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -16238,6 +17155,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 16 SubGroupA: 4 @@ -16250,6 +17170,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -16263,10 +17184,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 256 @@ -16283,6 +17206,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -16295,10 +17220,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -16310,6 +17238,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -16321,7 +17251,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x128_MI16x16x1_SN_GRVWB8_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_1_NTC0_NTD0_SS1_SPO1_SVW1_VWA1_WG32_4_2 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -16383,7 +17314,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -16394,6 +17327,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 @@ -16415,8 +17349,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 87 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x128_MI16x16x1_SN_GRVWB8_GSU1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_1_NTC0_NTD0_SS1_SU0_SUM0_SUS0_SPO1_SVW1_VWA1_WG32_4_2_WGM1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -16425,6 +17359,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 16 SubGroupA: 8 @@ -16437,6 +17374,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -16450,10 +17388,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -16470,6 +17410,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -16482,10 +17424,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -16497,6 +17442,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -16508,7 +17455,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x128_MI16x16x1_SN_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_1_NTC3_NTD3_SS1_SPO1_SVW2_VWA2_WG32_4_2 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -16570,7 +17518,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -16581,6 +17531,7 @@ NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 @@ -16602,8 +17553,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 88 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x128_MI16x16x1_SN_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_1_NTC3_NTD3_SS1_SU0_SUM0_SUS0_SPO1_SVW2_VWA2_WG32_4_2_WGM1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -16612,6 +17563,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 16 SubGroupA: 8 @@ -16624,6 +17578,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -16637,10 +17592,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -16657,6 +17614,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -16669,10 +17628,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -16684,6 +17646,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -16695,7 +17659,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x16x128_MI16x16x1_SN_GRVWB8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_1_NTC3_NTD3_SS0_SPO1_SVW8_VWA2_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -16757,7 +17722,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -16768,6 +17735,7 @@ NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 @@ -16789,8 +17757,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 89 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x16x128_MI16x16x1_SN_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_1_NTC3_NTD3_SS0_SU0_SUM0_SUS0_SPO1_SVW8_VWA2_WG64_4_1_WGM1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 0 StaggerUMapping: 0 @@ -16799,6 +17767,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -16811,6 +17782,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -16824,10 +17796,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -16844,6 +17818,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -16856,10 +17832,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -16871,6 +17850,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -16882,7 +17863,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LBSPPA2048_LBSPPB1024_MIWT2_1_NTC3_NTD3_SS1_SPO0_SVW2_VWA2_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false LSCA: 512 LSCB: 512 LSPA: 4 @@ -16944,7 +17926,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -16955,6 +17939,7 @@ NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 @@ -16976,8 +17961,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 90 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_GSU1_LBSPPA2048_LBSPPB1024_MIWT2_1_NTC3_NTD3_SS1_SU0_SUM0_SUS0_SPO0_SVW2_VWA2_WG16_4_4_WGM1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -16986,6 +17971,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 16 SubGroupA: 4 @@ -16998,6 +17986,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -17011,10 +18000,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 512 @@ -17031,6 +18022,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -17043,10 +18036,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -17058,6 +18054,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -17069,7 +18067,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x32x128_MI16x16x1_SN_GRVWA8_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT3_1_NTC0_NTD0_NLCA1_SS1_SPO0_SVW1_VWA1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -17131,7 +18130,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -17142,6 +18143,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 12 NumGlobalWriteVectorsPerThread: 12 @@ -17163,8 +18165,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 91 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x32x128_MI16x16x1_SN_GRVWA8_GSU1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT3_1_NTC0_NTD0_NLCA1_SS1_SU0_SUM0_SUS0_SPO0_SVW1_VWA1_WG32_8_1_WGM1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -17173,6 +18175,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -17185,6 +18190,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -17198,10 +18204,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -17218,6 +18226,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -17230,10 +18240,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -17245,6 +18258,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -17256,7 +18271,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x32x128_MI16x16x1_SN_CLR1_GRVWA8_LBSPPA1024_LBSPPB256_LPA16_LPB16_LRVW8_MIWT4_1_NTC3_NTD3_NLCA1_PLR1_SS1_SPO0_SVW4_VWA4_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -17318,7 +18334,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -17329,6 +18347,7 @@ NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 @@ -17350,8 +18369,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 92 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x32x128_MI16x16x1_SN_CLR1_GRVWA8_GSU1_LBSPPA1024_LBSPPB256_LPA16_LPB16_LRVW8_MIWT4_1_NTC3_NTD3_NLCA1_PLR1_SS1_SU0_SUM0_SUS0_SPO0_SVW4_VWA4_WG32_8_1_WGM1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -17360,6 +18379,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -17372,6 +18394,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -17385,10 +18408,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -17405,6 +18430,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -17417,10 +18444,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -17432,6 +18462,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -17443,7 +18475,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x128_MI16x16x1_SN_CLR1_GRVWA8_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT4_2_NTC0_NTD0_NLCA1_PLR1_SS1_SPO1_SVW2_VWA2_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -17505,7 +18538,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -17516,6 +18551,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 @@ -17537,8 +18573,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 93 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x128_MI16x16x1_SN_CLR1_GRVWA8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT4_2_NTC0_NTD0_NLCA1_PLR1_SS1_SU0_SUM0_SUS0_SPO1_SVW2_VWA2_WG16_4_4_WGM1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -17547,8 +18583,11 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 - SubGroup0: 4 - SubGroup1: 16 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false @@ -17559,6 +18598,7 @@ ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -17572,10 +18612,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -17592,6 +18634,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -17604,10 +18648,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -17619,6 +18666,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -17630,7 +18679,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x256_MI32x32x1_SN_LBSPPA512_LBSPPB512_LPA8_LPB8_LRVW8_MIWT1_1_NTC3_NTD3_SS1_SPO0_SVW1_VWA1_WG32_2_4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x256_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: false LSCA: 256 LSCB: 256 LSPA: 8 @@ -17692,7 +18742,9 @@ MatrixInstM: 32 MatrixInstN: 32 MatrixInstruction: [32, 32, 8, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -17703,6 +18755,7 @@ NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 @@ -17724,8 +18777,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 94 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x256_MI32x32x1_SN_GSU1_LBSPPA512_LBSPPB512_LPA8_LPB8_LRVW8_MIWT1_1_NTC3_NTD3_SS1_SU0_SUM0_SUS0_SPO0_SVW1_VWA1_WG32_2_4_WGM1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x256_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -17734,6 +18787,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -17746,6 +18802,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -17759,10 +18816,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 2, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 256 @@ -17779,6 +18838,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -17791,10 +18852,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -17806,6 +18870,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -17817,7 +18883,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x64x128_MI16x16x1_SN_GRVWA8_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT3_2_NTC0_NTD0_NLCA1_SS1_SPO0_SVW1_VWA1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -17879,7 +18946,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -17890,6 +18959,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 24 @@ -17911,8 +18981,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 95 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x64x128_MI16x16x1_SN_GRVWA8_GSU1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT3_2_NTC0_NTD0_NLCA1_SS1_SU0_SUM0_SUS0_SPO0_SVW1_VWA1_WG32_8_1_WGM1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -17921,6 +18991,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -17933,6 +19006,7 @@ ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -17946,10 +19020,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -17966,6 +19042,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -17978,10 +19056,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -17993,6 +19074,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -18004,7 +19087,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_CLR1_GRVWA8_LBSPPA1024_LBSPPB256_LPA16_LPB16_LRVW8_MIWT4_2_NTC3_NTD3_NLCA1_PLR1_SS1_SPO0_SVW4_VWA4_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -18066,7 +19150,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -18077,6 +19163,7 @@ NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 @@ -18098,8 +19185,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 96 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_CLR1_GRVWA8_GSU1_LBSPPA1024_LBSPPB256_LPA16_LPB16_LRVW8_MIWT4_2_NTC3_NTD3_NLCA1_PLR1_SS1_SU0_SUM0_SUS0_SPO0_SVW4_VWA4_WG32_8_1_WGM1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -18108,6 +19195,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -18120,6 +19210,7 @@ ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -18133,10 +19224,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -18153,6 +19246,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -18165,10 +19260,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -18180,6 +19278,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -18191,7 +19291,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_CLR1_GRVWA8_LBSPPA1024_LBSPPB256_LPA16_LPB16_LRVW8_MIWT4_4_NTC3_NTD3_NLCA1_PLR1_SS1_SPO0_SVW4_VWA4_WG32_4_2 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -18253,7 +19354,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -18264,6 +19367,7 @@ NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 @@ -18285,8 +19389,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 97 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_CLR1_GRVWA8_GSU1_LBSPPA1024_LBSPPB256_LPA16_LPB16_LRVW8_MIWT4_4_NTC3_NTD3_NLCA1_PLR1_SS1_SU0_SUM0_SUS0_SPO0_SVW4_VWA4_WG32_4_2_WGM1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -18295,6 +19399,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 16 SubGroupA: 8 @@ -18307,6 +19414,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -18320,10 +19428,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -18340,6 +19450,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -18352,10 +19464,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -18367,6 +19482,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -18378,7 +19495,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x256_MI16x16x1_SN_LBSPPA2048_LBSPPB512_LPA16_LPB16_LRVW8_MIWT4_2_NTC3_NTD3_SPO0_SVW4_VWA4_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false LSCA: 256 LSCB: 256 LSPA: 8 @@ -18440,7 +19558,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -18451,6 +19571,7 @@ NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 @@ -18472,8 +19593,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 98 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x256_MI16x16x1_SN_GSU1_LBSPPA2048_LBSPPB512_LPA16_LPB16_LRVW8_MIWT4_2_NTC3_NTD3_SU0_SUM0_SUS0_SPO0_SVW4_VWA4_WG16_4_4_WGM1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -18482,6 +19603,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 16 SubGroupA: 4 @@ -18494,6 +19618,7 @@ ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -18507,10 +19632,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 256 @@ -18527,6 +19654,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -18539,10 +19668,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -18554,6 +19686,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -18565,7 +19699,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x128x128_MI16x16x1_SN_CLR1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIAV1_MIWT3_4_NTC0_NTD0_PLR1_SS1_SPO0_SVW1_VWA1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 128 LSCB: 128 LSPA: 16 @@ -18627,7 +19762,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -18638,6 +19775,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 48 NumGlobalWriteVectorsPerThread: 48 @@ -18659,8 +19797,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 99 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x128x128_MI16x16x1_SN_CLR1_GSU1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIAV1_MIWT3_4_NTC0_NTD0_PLR1_SS1_SU0_SUM0_SUS0_SPO0_SVW1_VWA1_WG32_8_1_WGM1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -18669,6 +19807,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -18681,6 +19822,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -18694,10 +19836,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 128 @@ -18714,6 +19858,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -18726,10 +19872,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -18741,6 +19890,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -18752,7 +19903,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI32x32x1_SN_CLR1_LBSPPA256_LBSPPB128_LPA8_LPB8_LRVW8_MIAV1_MIWT2_2_NTC3_NTD3_PLR1_SS1_SPO0_SVW2_VWA2_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -18814,7 +19966,9 @@ MatrixInstM: 32 MatrixInstN: 32 MatrixInstruction: [32, 32, 8, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -18825,6 +19979,7 @@ NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 32 @@ -18846,8 +20001,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 100 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI32x32x1_SN_CLR1_GSU1_LBSPPA256_LBSPPB128_LPA8_LPB8_LRVW8_MIAV1_MIWT2_2_NTC3_NTD3_PLR1_SS1_SU0_SUM0_SUS0_SPO0_SVW2_VWA2_WG64_4_1_WGM1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -18856,6 +20011,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -18868,6 +20026,7 @@ ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -18881,10 +20040,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -18901,6 +20062,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -18913,10 +20076,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -18928,6 +20094,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -18939,7 +20107,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB128_LPA16_LPB16_LRVW8_MIAV1_MIWT4_4_NTC3_NTD3_PLR1_SS1_SPO0_SVW4_VWA4_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -19001,7 +20170,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -19012,6 +20183,7 @@ NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 @@ -19033,8 +20205,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 101 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB128_LPA16_LPB16_LRVW8_MIAV1_MIWT4_4_NTC3_NTD3_PLR1_SS1_SU0_SUM0_SUS0_SPO0_SVW4_VWA4_WG32_8_1_WGM1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -19043,6 +20215,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -19055,6 +20230,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -19068,10 +20244,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -19088,6 +20266,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -19100,10 +20280,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -19115,6 +20298,8 @@ GlobalReadVectorWidthB: 2 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -19126,7 +20311,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LBSPPA1024_LPA4_LPB4_LRVW4_MIWT8_7_NEPBS16_SSO0_SVW8_VWA8_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 8 @@ -19188,7 +20374,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -19199,6 +20387,7 @@ NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 NumElementsPerThread: 224 NumGlobalWriteVectorsPerThread: 28 @@ -19220,8 +20409,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 102 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_GSU1_LBSPPA1024_LPA4_LPB4_LRVW4_MIWT8_7_NEPBS16_SU8_SUM0_SUS256_SSO0_SVW8_VWA8_WG32_8_1_WGM16 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 0 @@ -19230,6 +20419,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -19242,6 +20434,7 @@ ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -19255,10 +20448,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 16 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -19275,6 +20470,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -19287,10 +20484,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -19302,6 +20502,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -19313,7 +20515,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x16x512_MI16x16x1_SN_GRVWB8_LBSPPA1024_LBSPPB1024_MIWT1_1_NTC0_NTD0_SS1_SVW1_VWA1_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false LSCA: 512 LSCB: 512 LSPA: 4 @@ -19375,7 +20578,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -19386,6 +20591,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 @@ -19407,8 +20613,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 103 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x16x512_MI16x16x1_SN_GRVWB8_GSU1_LBSPPA1024_LBSPPB1024_MIWT1_1_NTC0_NTD0_SS1_SU0_SUM0_SUS0_SVW1_VWA1_WG16_4_4_WGM1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -19417,6 +20623,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 16 SubGroupA: 4 @@ -19429,6 +20638,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -19442,10 +20652,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 512 @@ -19462,6 +20674,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -19474,10 +20688,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 512 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -19489,6 +20706,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -19500,7 +20719,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x16x512_MI16x16x1_SN_GRVWB8_LBSPPA1024_LBSPPB1024_MIWT1_1_NTC0_NTD0_SS0_SVW1_VWA1_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false LSCA: 512 LSCB: 512 LSPA: 4 @@ -19562,7 +20782,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -19573,6 +20795,7 @@ NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 @@ -19594,8 +20817,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 104 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x16x512_MI16x16x1_SN_GRVWB8_GSU1_LBSPPA1024_LBSPPB1024_MIWT1_1_NTC0_NTD0_SS0_SU0_SUM0_SUS0_SVW1_VWA1_WG16_4_4_WGM1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 0 StaggerUMapping: 0 @@ -19604,6 +20827,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 16 SubGroupA: 4 @@ -19616,6 +20842,7 @@ ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -19629,10 +20856,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 512 @@ -19649,6 +20878,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -19661,10 +20892,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -19676,6 +20910,8 @@ GlobalReadVectorWidthB: 2 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -19687,7 +20923,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x176x64_MI16x16x1_SN_GRVWA2_GRVWB2_LBSPPA512_LPA4_LPB4_LRVW4_MIWT4_11_NEPBS0_SVW4_VWA4_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x176x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 8 @@ -19749,7 +20986,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -19760,6 +20999,7 @@ NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 176 NumGlobalWriteVectorsPerThread: 44 @@ -19781,8 +21021,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 105 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x176x64_MI16x16x1_SN_GRVWA2_GRVWB2_GSU1_LBSPPA512_LPA4_LPB4_LRVW4_MIWT4_11_NEPBS0_SU8_SUM0_SUS256_SVW4_VWA4_WG64_4_1_WGM32 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x176x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 0 @@ -19791,6 +21031,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 4 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -19803,6 +21046,7 @@ ThreadTileB: 11 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -19816,10 +21060,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 32 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -19836,6 +21082,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -19848,10 +21096,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -19863,6 +21114,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -19874,7 +21127,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI32x32x1_SN_GRVWA8_GRVWB8_LBSPPA512_LPA8_LPB8_LRVW8_MIWT4_3_NEPBS0_SVW4_VWA4_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -19936,7 +21190,9 @@ MatrixInstM: 32 MatrixInstN: 32 MatrixInstruction: [32, 32, 8, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -19947,6 +21203,7 @@ NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 192 NumGlobalWriteVectorsPerThread: 48 @@ -19968,8 +21225,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 106 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI32x32x1_SN_GRVWA8_GRVWB8_GSU1_LBSPPA512_LPA8_LPB8_LRVW8_MIWT4_3_NEPBS0_SU8_SUM0_SUS256_SVW4_VWA4_WG64_4_1_WGM32 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 0 @@ -19978,6 +21235,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 4 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -19990,6 +21250,7 @@ ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -20003,10 +21264,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 32 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -20023,6 +21286,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -20035,10 +21300,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -20050,6 +21318,8 @@ GlobalReadVectorWidthB: 2 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -20061,7 +21331,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LBSPPA1024_LPA4_LPB4_LRVW4_MIWT8_6_NEPBS16_SSO0_SVW8_VWA8_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 8 @@ -20123,7 +21394,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -20134,6 +21407,7 @@ NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 NumElementsPerThread: 192 NumGlobalWriteVectorsPerThread: 24 @@ -20155,8 +21429,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 107 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_GSU1_LBSPPA1024_LPA4_LPB4_LRVW4_MIWT8_6_NEPBS16_SU8_SUM0_SUS256_SSO0_SVW8_VWA8_WG32_8_1_WGM16 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 0 @@ -20165,8 +21439,11 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 32 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false @@ -20177,6 +21454,7 @@ ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -20190,10 +21468,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 16 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -20210,6 +21490,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -20222,10 +21504,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -20237,6 +21522,8 @@ GlobalReadVectorWidthB: 2 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -20248,7 +21535,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_GRVWA2_GRVWB2_LBSPPA1024_LPA4_LPB4_LRVW4_MIWT8_7_SVW8_VWA8_WG32_8_1_WGMXCC4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 8 @@ -20310,7 +21598,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -20321,6 +21611,7 @@ NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 NumElementsPerThread: 224 NumGlobalWriteVectorsPerThread: 28 @@ -20342,8 +21633,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 108 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_GRVWA2_GRVWB2_GSU1_LBSPPA1024_LPA4_LPB4_LRVW4_MIWT8_7_SU8_SUM0_SUS256_SVW8_VWA8_WG32_8_1_WGM1_WGMXCC4 + SolutionIndex: 105 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 0 @@ -20352,6 +21643,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 4 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -20364,6 +21658,7 @@ ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -20377,10 +21672,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -20397,6 +21694,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -20409,10 +21708,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -20424,6 +21726,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -20435,7 +21739,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI32x32x1_SN_GRVWA8_GRVWB8_LBSPPA512_LPA8_LPB8_LRVW8_MIWT4_3_SVW4_VWA4_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -20497,7 +21802,9 @@ MatrixInstM: 32 MatrixInstN: 32 MatrixInstruction: [32, 32, 8, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -20508,6 +21815,7 @@ NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 NumElementsPerThread: 192 NumGlobalWriteVectorsPerThread: 48 @@ -20529,8 +21837,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 109 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI32x32x1_SN_GRVWA8_GRVWB8_GSU1_LBSPPA512_LPA8_LPB8_LRVW8_MIWT4_3_SU8_SUM0_SUS256_SVW4_VWA4_WG64_4_1_WGM16 + SolutionIndex: 106 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 0 @@ -20539,6 +21847,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 4 StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -20551,6 +21862,7 @@ ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -20564,10 +21876,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 16 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -20584,6 +21898,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -20596,10 +21912,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -20611,6 +21930,8 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -20622,7 +21943,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x32_MI16x16x1_SN_GRVWA2_GRVWB8_LBSPPA512_LPA4_LPB8_LRVW4_MIAV1_MIWT8_4_NTC4_NTD4_SPO0_SSO0_SVW8_VWA8_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSUAMB_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 32 LSCB: 32 LSPA: 16 @@ -20684,7 +22006,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -20695,6 +22019,7 @@ NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 128 NumGlobalWriteVectorsPerThread: 16 @@ -20716,8 +22041,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 110 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x32_MI16x16x1_SN_GRVWA2_GRVWB8_GSU1_LBSPPA512_LPA4_LPB8_LRVW4_MIAV1_MIWT8_4_NTC4_NTD4_SU0_SUM0_SUS0_SPO0_SSO0_SVW8_VWA8_WG16_16_1_WGM1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -20726,6 +22051,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -20738,6 +22066,7 @@ ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -20751,10 +22080,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 32 @@ -20771,6 +22102,8 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertSummationElementMultiple: 1 @@ -20783,10 +22116,13 @@ CodeObjectVersion: default ConvertAfterDS: false CustomKernelName: '' + DebugStreamK: 0 DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: false @@ -20798,6 +22134,8 @@ GlobalReadVectorWidthB: 2 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 8 GroupLoadStore: false GuaranteeNoPartialA: true @@ -20809,7 +22147,8 @@ InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseUniversalArgs: true} KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_GRVWA2_GRVWB2_LBSPPA1024_LPA4_LPB4_LRVW4_MIWT8_7_SVW8_VWA8_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSUAMB_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 8 @@ -20871,7 +22210,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -20882,6 +22223,7 @@ NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 NumElementsPerThread: 224 NumGlobalWriteVectorsPerThread: 28 @@ -20903,8 +22245,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 111 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_GRVWA2_GRVWB2_GSU1_LBSPPA1024_LPA4_LPB4_LRVW4_MIWT8_7_SU8_SUM0_SUS256_SVW8_VWA8_WG32_8_1_WGM16 + SolutionIndex: 108 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 0 @@ -20913,6 +22255,9 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 4 StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -20925,6 +22270,7 @@ ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -20938,10 +22284,12 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 16 WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] _DepthU: 64 @@ -21004,7 +22352,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x320x32_MI16x16x1_SN_GRVWA2_GRVWB8_K1_LBSPPA512_LBSPPB128_LPA4_LPB8_LRVW4_MIWT8_5_SVW8_VWA8_VWB1_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x320x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 32 LSCB: 32 LSPA: 16 @@ -21067,7 +22416,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -21100,8 +22451,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 112 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x320x32_MI16x16x1_SN_GRVWA2_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA512_LBSPPB128_LPA4_LPB8_LRVW4_MIWT8_5_SU8_SUM0_SUS256_SVW8_VWA8_VWB1_WG16_16_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x320x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 0 @@ -21140,6 +22491,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 16 @@ -21207,7 +22559,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI32x32x1_SN_GRVWA8_GRVWB8_K1_LBSPPA512_LBSPPB128_LPA8_LPB8_LRVW8_MIWT4_3_SVW4_VWA4_VWB1_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -21270,7 +22623,9 @@ MatrixInstM: 32 MatrixInstN: 32 MatrixInstruction: [32, 32, 8, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -21303,8 +22658,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 113 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI32x32x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA512_LBSPPB128_LPA8_LPB8_LRVW8_MIWT4_3_SU8_SUM0_SUS256_SVW4_VWA4_VWB1_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 0 @@ -21343,6 +22698,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 16 @@ -21410,7 +22766,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x32_MI16x16x1_SN_GRVWA2_GRVWB8_K1_LBSPPA512_LBSPPB256_LPA4_LPB8_LRVW4_MIWT8_4_SVW8_VWA8_VWB4_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 32 LSCB: 32 LSPA: 16 @@ -21473,7 +22830,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -21506,8 +22865,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 114 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x32_MI16x16x1_SN_GRVWA2_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA512_LBSPPB256_LPA4_LPB8_LRVW4_MIWT8_4_SU8_SUM0_SUS256_SVW8_VWA8_VWB4_WG16_16_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 StaggerU: 8 StaggerUMapping: 0 @@ -21546,6 +22905,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 16 @@ -21613,7 +22973,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT192x256x64_MI32x32x1_SN_GRVWA8_GRVWB8_K1_LBSPPA256_LBSPPB256_LPA8_LPB8_LRVW8_MIWT6_2_SVW2_VWA2_VWB2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -21676,7 +23037,9 @@ MatrixInstM: 32 MatrixInstN: 32 MatrixInstruction: [32, 32, 8, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -21709,8 +23072,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 115 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT192x256x64_MI32x32x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB256_LPA8_LPB8_LRVW8_MIWT6_2_SU0_SUM0_SUS0_SVW2_VWA2_VWB2_WG32_8_1_WGM1_WGMXCC4_WGMXCCG80 + SolutionIndex: 112 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCG80 SourceSwap: true StaggerU: 0 StaggerUMapping: 0 @@ -21749,6 +23112,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 @@ -21817,7 +23181,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT96x128x32_MI32x32x1_SN_GRVWA4_GRVWB8_K1_LBSPPA128_LBSPPB128_LPA8_LPB8_LRVW8_MIWT3_1_SVW1_VWA1_VWB1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 32 LSCB: 32 LSPA: 32 @@ -21880,7 +23245,9 @@ MatrixInstM: 32 MatrixInstN: 32 MatrixInstruction: [32, 32, 8, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -21913,8 +23280,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 116 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT96x128x32_MI32x32x1_SN_GRVWA4_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA128_LBSPPB128_LPA8_LPB8_LRVW8_MIWT3_1_SU0_SUM0_SUS0_SVW1_VWA1_VWB1_WG32_8_1_WGM1_WGMXCC4_WGMXCCG80 + SolutionIndex: 113 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCG80 SourceSwap: true StaggerU: 0 StaggerUMapping: 0 @@ -21953,6 +23320,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 @@ -22021,7 +23389,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT64x128x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT4_2_SVW4_VWA4_VWB2_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -22084,7 +23453,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -22117,8 +23488,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 117 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT64x128x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT4_2_SU0_SUM0_SUS0_SVW4_VWA4_VWB2_WG16_16_1_WGM1_WGMXCC4_WGMXCCG80 + SolutionIndex: 114 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCG80 SourceSwap: true StaggerU: 0 StaggerUMapping: 0 @@ -22157,6 +23528,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 @@ -22225,7 +23597,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT128x96x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA512_LBSPPB128_LPA16_LPB16_LRVW8_MIWT4_3_SVW4_VWA4_VWB1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -22288,7 +23661,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -22321,8 +23696,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 118 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT128x96x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU7_GSUC0_GSUWGMRR0_K1_LBSPPA512_LBSPPB128_LPA16_LPB16_LRVW8_MIWT4_3_SU0_SUM0_SUS0_SVW4_VWA4_VWB1_WG32_8_1_WGM1_WGMXCC4_WGMXCCG80 + SolutionIndex: 115 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU7_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCG80 SourceSwap: true StaggerU: 0 StaggerUMapping: 0 @@ -22361,6 +23736,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 @@ -22429,7 +23805,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT32x16x32_MI16x16x1_SN_GRVWA4_GRVWB4_K1_LBSPPA128_LBSPPB128_LPA4_LPB4_LRVW4_MIWT1_1_SVW1_VWA1_VWB1_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: false LSCA: 32 LSCB: 32 LSPA: 16 @@ -22492,7 +23869,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -22525,8 +23904,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 119 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT32x16x32_MI16x16x1_SN_GRVWA4_GRVWB4_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA128_LBSPPB128_LPA4_LPB4_LRVW4_MIWT1_1_SU0_SUM0_SUS0_SVW1_VWA1_VWB1_WG32_4_1_WGM1_WGMXCC4_WGMXCCG80 + SolutionIndex: 116 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC4_WGMXCCG80 SourceSwap: true StaggerU: 0 StaggerUMapping: 0 @@ -22565,6 +23944,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 4, 1] WorkGroupMapping: 1 @@ -22633,7 +24013,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT64x64x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_2_SVW2_VWA2_VWB2_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -22696,7 +24077,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -22729,8 +24112,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 120 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT64x64x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_2_SU0_SUM0_SUS0_SVW2_VWA2_VWB2_WG32_8_1_WGM1_WGMXCC4_WGMXCCG80 + SolutionIndex: 117 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCG80 SourceSwap: true StaggerU: 0 StaggerUMapping: 0 @@ -22769,6 +24152,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 @@ -22837,7 +24221,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT128x128x32_MI32x32x1_SN_GRVWA8_GRVWB8_K1_LBSPPA128_LBSPPB128_LPA8_LPB8_LRVW8_MIWT1_2_SVW1_VWA1_VWB2_WG128_4_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_4_1 + LDSTrInst: false LSCA: 32 LSCB: 32 LSPA: 128 @@ -22900,7 +24285,9 @@ MatrixInstM: 32 MatrixInstN: 32 MatrixInstruction: [32, 32, 8, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -22933,8 +24320,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 121 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT128x128x32_MI32x32x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA128_LBSPPB128_LPA8_LPB8_LRVW8_MIWT1_2_SU0_SUM0_SUS0_SVW1_VWA1_VWB2_WG128_4_1_WGM1_WGMXCC4_WGMXCCG80 + SolutionIndex: 118 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_4_1_WGM1_WGMXCC4_WGMXCCG80 SourceSwap: true StaggerU: 0 StaggerUMapping: 0 @@ -22973,6 +24360,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [128, 4, 1] WorkGroupMapping: 1 @@ -23041,7 +24429,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT192x256x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA512_LBSPPB512_LPA16_LPB16_LRVW8_MIWT12_4_SVW4_VWA4_VWB4_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -23104,7 +24493,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -23137,8 +24528,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 122 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT192x256x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA512_LBSPPB512_LPA16_LPB16_LRVW8_MIWT12_4_SU4_SUM0_SUS256_SVW4_VWA4_VWB4_WG16_16_1_WGM1_WGMXCC4_WGMXCCG80 + SolutionIndex: 119 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT12_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCG80 SourceSwap: true StaggerU: 4 StaggerUMapping: 0 @@ -23177,6 +24568,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 @@ -23245,7 +24637,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT192x192x32_MI16x16x1_SN_GRVWA4_GRVWB4_K1_LBSPPA256_LBSPPB128_LPA4_LPB4_LRVW4_MIWT12_3_SVW4_VWA4_VWB1_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 32 LSCB: 32 LSPA: 32 @@ -23308,7 +24701,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -23341,8 +24736,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 123 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT192x192x32_MI16x16x1_SN_GRVWA4_GRVWB4_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB128_LPA4_LPB4_LRVW4_MIWT12_3_SU4_SUM0_SUS256_SVW4_VWA4_VWB1_WG16_16_1_WGM1_WGMXCC4_WGMXCCG80 + SolutionIndex: 120 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT12_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCG80 SourceSwap: true StaggerU: 4 StaggerUMapping: 0 @@ -23381,6 +24776,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 @@ -23449,7 +24845,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT48x64x32_MI16x16x1_SN_GRVWA2_GRVWB4_K1_LBSPPA128_LBSPPB128_LPA4_LPB4_LRVW4_MIWT3_1_SVW1_VWA1_VWB1_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT48x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 32 LSCB: 32 LSPA: 16 @@ -23512,7 +24909,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -23545,8 +24944,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 124 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT48x64x32_MI16x16x1_SN_GRVWA2_GRVWB4_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA128_LBSPPB128_LPA4_LPB4_LRVW4_MIWT3_1_SU0_SUM0_SUS0_SVW1_VWA1_VWB1_WG16_16_1_WGM1_WGMXCC4_WGMXCCG80 + SolutionIndex: 121 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT48x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCG80 SourceSwap: true StaggerU: 0 StaggerUMapping: 0 @@ -23585,6 +24984,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 @@ -23653,7 +25053,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT32x32x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIWT2_1_SVW2_VWA2_VWB1_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 16 @@ -23716,7 +25117,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -23749,8 +25152,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 125 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT32x32x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIWT2_1_SU0_SUM0_SUS0_SVW2_VWA2_VWB1_WG16_8_1_WGM1_WGMXCC4_WGMXCCG80 + SolutionIndex: 122 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC4_WGMXCCG80 SourceSwap: true StaggerU: 0 StaggerUMapping: 0 @@ -23789,6 +25192,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 8, 1] WorkGroupMapping: 1 @@ -23857,7 +25261,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT256x128x32_MI32x32x1_SN_GRVWA8_GRVWB8_K1_LBSPPA256_LBSPPB128_LPA8_LPB8_LRVW8_MIWT4_1_SVW4_VWA4_VWB1_WG64_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_8_1 + LDSTrInst: false LSCA: 32 LSCB: 32 LSPA: 128 @@ -23920,7 +25325,9 @@ MatrixInstM: 32 MatrixInstN: 32 MatrixInstruction: [32, 32, 8, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -23953,8 +25360,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 126 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT256x128x32_MI32x32x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB128_LPA8_LPB8_LRVW8_MIWT4_1_SU0_SUM0_SUS0_SVW4_VWA4_VWB1_WG64_8_1_WGM1_WGMXCC4_WGMXCCG80 + SolutionIndex: 123 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_8_1_WGM1_WGMXCC4_WGMXCCG80 SourceSwap: true StaggerU: 0 StaggerUMapping: 0 @@ -23993,6 +25400,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 8, 1] WorkGroupMapping: 1 @@ -24061,7 +25469,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT64x32x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIWT2_1_SVW2_VWA2_VWB1_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -24124,7 +25533,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -24157,8 +25568,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 127 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT64x32x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIWT2_1_SU0_SUM0_SUS0_SVW2_VWA2_VWB1_WG32_8_1_WGM1_WGMXCC4_WGMXCCG80 + SolutionIndex: 124 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCG80 SourceSwap: true StaggerU: 0 StaggerUMapping: 0 @@ -24197,6 +25608,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 @@ -24265,7 +25677,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT80x64x32_MI16x16x1_SN_GRVWA2_GRVWB4_K1_LBSPPA128_LBSPPB128_LPA4_LPB4_LRVW4_MIWT5_1_SVW1_VWA1_VWB1_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT80x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false LSCA: 32 LSCB: 32 LSPA: 16 @@ -24328,7 +25741,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -24361,8 +25776,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 128 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT80x64x32_MI16x16x1_SN_GRVWA2_GRVWB4_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA128_LBSPPB128_LPA4_LPB4_LRVW4_MIWT5_1_SU0_SUM0_SUS0_SVW1_VWA1_VWB1_WG16_16_1_WGM1_WGMXCC4_WGMXCCG80 + SolutionIndex: 125 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT80x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPMn1_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCG80 SourceSwap: true StaggerU: 0 StaggerUMapping: 0 @@ -24401,6 +25816,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 @@ -24469,7 +25885,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT192x256x64_MI16x16x1_SN_GRVWA8_GRVWB8_K1_LBSPPA256_LBSPPB1024_LPA16_LPB16_LRVW8_MIWT6_8_SVW2_VWA2_VWB8_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 32 @@ -24532,7 +25949,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -24565,8 +25984,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 129 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_HAS_UserArgs_MT192x256x64_MI16x16x1_SN_GRVWA8_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB1024_LPA16_LPB16_LRVW8_MIWT6_8_SU0_SUM0_SUS0_SVW2_VWA2_VWB8_WG32_8_1_WGM1_WGMXCC4_WGMXCCG80 + SolutionIndex: 126 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPMn1_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCG80 SourceSwap: true StaggerU: 0 StaggerUMapping: 0 @@ -24605,6 +26024,7 @@ WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 @@ -24634,34 +26054,36 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x32_MI32x3Jk7KegcbCGxUSqp0v8SVRUJS-tt-6f1vibxVouqn6nE= BufferLoad: true BufferStore: true CUCount: null ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false + CodeObjectVersion: 4 + ConvertAfterDS: 0 CustomKernelName: '' DebugStreamK: 0 DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 1 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr + EnableF32XEmulationLds: false EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadPerMfma: 0.27 + GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -24673,276 +26095,72 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x32_MI16x16x1_SN_GRVWA2_GRVWB8_K1_LBSPPA512_LBSPPB256_LPA4_LPB8_LRVW4_MIWT8_4_SVW8_VWA8_VWB4_WG16_16_1 - LSCA: 32 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM0p27_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW8_LWPM0p80_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS0_NLCA2_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_1 + LDSTrInst: false + LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 64 - LVCA: 16 + LSPA: 32 + LSPB: 16 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25728 + LdsBytesNoAmax: 4608 LdsInitCVgprs: false - LdsNumBytes: 25728 - LdsNumElementsAlignedA: 8320 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 4608 + LdsNumElementsAlignedA: 0 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8320 - LdsOffsetB_Blk: 41088 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 0 + LdsOffsetB_Blk: 8192 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25728 - LdsOffsetMetadata_Blk: 41088 - LdsPadA: 4 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 8192 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 - LocalWritePerMfma: -1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: 0.8 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 4 LoopUnroll: 32 MFMA_BF16_1K: true - MIArchVgpr: 0 - MIBlock: [16, 16, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 8, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 130 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x32_MI16x16x1_SN_GRVWA2_GRVWB8_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA512_LBSPPB256_LPA4_LPB8_LRVW4_MIWT8_4_SU8_SUM0_SUS256_SVW8_VWA8_VWB4_WG16_16_1_WGM16_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 4 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 8 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x32_MI32x3Jk7KegcbCGxUSqp0v8SVRUJS-tt-6f1vibxVouqn6nE= - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: 0 - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 1 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 0.27 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x32_MI32x32x1_SN_K1_MIWT1_2 - LDSTrInst: false - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 2 - LVCB: 4 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 4608 - LdsInitCVgprs: false - LdsNumBytes: 4608 - LdsNumElementsAlignedA: 0 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 0 - LdsOffsetB_Blk: 8192 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 8192 - LdsPadA: 0 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: 0.8 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 32 - MFMA_BF16_1K: true - MIArchVgpr: false - MIBlock: [32, 32, 8, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxLDS: -1 MaxOccupancy: 40 MbskPrefetchMethod: 0 NoLdsWriteCode: false @@ -24978,8 +26196,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 131 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x32_MI32x32x1_SN_GSU1_GSUC0_GSUWGMRR0_K1_MIWT1_2_SU2_SUM0_SUS256_WGM266_WGMXCC1_WGMXCCG0 + SolutionIndex: 127 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA1_DTVB0_EPS0_FDSI0_GRPM0p27_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW8_LWPM0p80_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD0_NTM0_NEPBS0_NLCA2_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU2_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_1_WGM266_WGMXCC1_WGMXCCG0 SourceSwap: 1 StaggerU: 2 StaggerUMapping: 0 @@ -25094,7 +26312,8 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x64_MI16x16x1_SN_AFEM4_AFEM1_ASEM32_K1_MIWT8_4 + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM4_AFEM1_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p27_GRVWA8_GRVWB8_GSUAMBSK_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA2_WSGRB2_WS64_WG16_16_1 + LDSTrInst: false LSCA: 64 LSCB: 64 LSPA: 4 @@ -25158,7 +26377,9 @@ MatrixInstM: 16 MatrixInstN: 16 MatrixInstruction: [16, 16, 16, 1] + MaxLDS: -1 MaxOccupancy: 40 + MbskPrefetchMethod: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false @@ -25191,8 +26412,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 132 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x64_MI16x16x1_SN_AFEM4_AFEM1_ASEM32_GSU1_GSUC0_GSUWGMRR0_K1_MIWT8_4_SU4_SUM0_SUS512_WGM0_WGMXCC1_WGMXCCG152 + SolutionIndex: 128 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_UserArgs_MT128x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM4_AFEM1_ASEM32_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM0p27_GRVWA8_GRVWB8_GSU1_GSUAMBSK_GSUC0_GSUWGMRR0_GLS0_ISA942_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL0_PGR2_PLR1_PKA1_SIA3_SS1_SU4_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA8_VWB4_WSGRA2_WSGRB2_WS64_WG16_16_1_WGM0_WGMXCC1_WGMXCCG152 SourceSwap: 1 StaggerU: 4 StaggerUMapping: 0 @@ -25232,6 +26453,7 @@ WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 0 @@ -25300,19 +26522,19 @@ - - [1024, 8192, 1, 8192] - [7, 0.0] - - [7168, 16384, 1, 8192] - - [102, 0.0] + - [99, 0.0] - - [3584, 16384, 1, 8192] - [3, 0.0] - - [1280, 16384, 1, 8192] - - [105, 0.0] + - [102, 0.0] - - [1024, 16384, 1, 8192] - [8, 0.0] - - [7168, 32768, 1, 8192] - - [107, 0.0] + - [104, 0.0] - - [3584, 32768, 1, 8192] - [2, 0.0] - - [1280, 32768, 1, 8192] - - [106, 0.0] + - [103, 0.0] - - [1024, 32768, 1, 8192] - [8, 0.0] - - [7168, 65536, 1, 8192] @@ -25324,319 +26546,315 @@ - - [1024, 65536, 1, 8192] - [1, 0.0] - - [8192, 1, 1, 1024] - - [87, 0.0] + - [84, 0.0] - - [7168, 1, 1, 8192] - - [86, 0.0] + - [83, 0.0] - - [8192, 1, 1, 3584] - - [88, 0.0] + - [85, 0.0] - - [1280, 1, 1, 8192] - - [103, 0.0] + - [100, 0.0] - - [8192, 2, 1, 1024] - - [87, 0.0] + - [84, 0.0] - - [7168, 2, 1, 8192] - - [86, 0.0] + - [83, 0.0] - - [8192, 2, 1, 3584] - - [88, 0.0] + - [85, 0.0] - - [1280, 2, 1, 8192] - - [103, 0.0] + - [100, 0.0] - - [8192, 4, 1, 1024] - - [87, 0.0] + - [84, 0.0] - - [7168, 4, 1, 8192] - - [86, 0.0] + - [83, 0.0] - - [8192, 4, 1, 3584] - - [88, 0.0] + - [85, 0.0] - - [1280, 4, 1, 8192] - - [103, 0.0] + - [100, 0.0] - - [8192, 8, 1, 1024] - - [87, 0.0] + - [84, 0.0] - - [7168, 8, 1, 8192] - - [86, 0.0] + - [83, 0.0] - - [8192, 8, 1, 3584] - - [88, 0.0] + - [85, 0.0] - - [1280, 8, 1, 8192] - - [103, 0.0] + - [100, 0.0] - - [8192, 16, 1, 1024] - - [89, 0.0] - - - [7168, 16, 1, 8192] - [86, 0.0] + - - [7168, 16, 1, 8192] + - [83, 0.0] - - [8192, 16, 1, 3584] - - [88, 0.0] + - [85, 0.0] - - [1280, 16, 1, 8192] - - [104, 0.0] + - [101, 0.0] - - [8192, 32, 1, 1024] - - [92, 0.0] + - [89, 0.0] - - [7168, 32, 1, 8192] - - [91, 0.0] + - [88, 0.0] - - [8192, 32, 1, 3584] - - [93, 0.0] - - - [1280, 32, 1, 8192] - [90, 0.0] + - - [1280, 32, 1, 8192] + - [87, 0.0] - - [8192, 64, 1, 1024] - - [96, 0.0] + - [93, 0.0] - - [7168, 64, 1, 8192] - - [95, 0.0] + - [92, 0.0] - - [8192, 64, 1, 3584] - - [97, 0.0] - - - [1280, 64, 1, 8192] - [94, 0.0] + - - [1280, 64, 1, 8192] + - [91, 0.0] - - [8192, 128, 1, 1024] - - [100, 0.0] + - [97, 0.0] - - [7168, 128, 1, 8192] - - [99, 0.0] + - [96, 0.0] - - [8192, 128, 1, 3584] - - [101, 0.0] - - - [1280, 128, 1, 8192] - [98, 0.0] + - - [1280, 128, 1, 8192] + - [95, 0.0] - - [512, 512, 1, 512] - - [76, 0.0] + - [73, 0.0] - - [1024, 1024, 1, 1024] - - [30, 0.0] + - [28, 0.0] - - [2048, 2048, 1, 2048] - - [67, 0.0] + - [64, 0.0] - - [4096, 4096, 1, 4096] - - [108, 0.0] + - [105, 0.0] - - [4096, 4096, 1, 8192] - - [10, 0.0] + - [9, 0.0] - - [8192, 8192, 1, 8192] - - [111, 0.0] + - [108, 0.0] - - [2, 7168, 1, 8192] - - [11, 0.0] + - [10, 0.0] - - [2, 3584, 1, 8192] - - [12, 0.0] + - [11, 0.0] - - [2, 1280, 1, 8192] - - [13, 0.0] + - [12, 0.0] - - [8192, 7168, 1, 8192] - - [14, 0.0] + - [13, 0.0] - - [8192, 3584, 1, 8192] - - [15, 0.0] + - [14, 0.0] - - [8192, 1280, 1, 8192] - - [16, 0.0] - - - [3584, 2, 1, 8192] - - [9, 0.0] - - - [1024, 2, 1, 8192] - - [17, 0.0] + - [15, 0.0] - - [152710, 2048, 1, 512] - - [18, 0.0] + - [16, 0.0] - - [6, 384, 1, 1536] - - [19, 0.0] + - [17, 0.0] - - [6, 3072, 1, 768] - - [20, 0.0] + - [18, 0.0] - - [462, 768, 1, 384] - - [21, 0.0] + - [19, 0.0] - - [462, 4096, 1, 384] - - [22, 0.0] + - [20, 0.0] - - [462, 1472, 1, 384] - - [23, 0.0] + - [21, 0.0] - - [6, 3072, 1, 1536] - - [19, 0.0] + - [17, 0.0] - - [462, 768, 1, 768] - - [24, 0.0] - - - [462, 4096, 1, 768] - [22, 0.0] + - - [462, 4096, 1, 768] + - [20, 0.0] - - [462, 1472, 1, 768] - - [23, 0.0] + - [21, 0.0] - - [6, 3072, 1, 3072] - - [25, 0.0] + - [23, 0.0] - - [462, 768, 1, 1536] - - [26, 0.0] + - [24, 0.0] - - [462, 4096, 1, 1536] - - [27, 0.0] + - [25, 0.0] - - [462, 1472, 1, 1536] - - [28, 0.0] + - [26, 0.0] - - [16384, 128, 1, 2304] - - [29, 0.0] + - [27, 0.0] - - [1152, 1152, 1, 1152] - - [31, 0.0] + - [29, 0.0] - - [2560, 2560, 1, 2560] - - [32, 0.0] + - [30, 0.0] - - [384, 1536, 1, 12288] - - [33, 0.0] + - [31, 0.0] - - [384, 6144, 1, 1536] - - [34, 0.0] + - [32, 0.0] - - [96, 1536, 1, 1536] - - [35, 0.0] + - [33, 0.0] - - [96, 1536, 1, 12288] - - [36, 0.0] + - [34, 0.0] - - [96, 6144, 1, 1536] - - [37, 0.0] + - [35, 0.0] - - [6144, 384, 1, 3072] - - [38, 0.0] + - [36, 0.0] - - [6144, 1536, 1, 384] - - [39, 0.0] + - [37, 0.0] - - [2048, 131072, 1, 16384] - - [40, 0.0] + - [38, 0.0] - - [1280, 1280, 1, 1280] - - [41, 0.0] + - [39, 0.0] - - [1408, 1408, 1, 1408] - - [42, 0.0] + - [40, 0.0] - - [2688, 2688, 1, 2688] - - [43, 0.0] + - [41, 0.0] - - [3712, 3712, 1, 3712] - - [44, 0.0] + - [42, 0.0] - - [3840, 3840, 1, 3840] - - [45, 0.0] + - [43, 0.0] - - [3968, 3968, 1, 3968] - - [46, 0.0] + - [44, 0.0] - - [3328, 3328, 1, 3328] - - [47, 0.0] + - [45, 0.0] - - [4000, 8192, 1, 8192] - - [48, 0.0] + - [46, 0.0] - - [64896, 2048, 1, 512] - - [49, 0.0] + - [47, 0.0] - - [6144, 384, 1, 384] - - [50, 0.0] + - [48, 0.0] - - [16384, 32768, 1, 2304] - - [51, 0.0] + - [49, 0.0] - - [16384, 128, 1, 13312] - - [52, 0.0] + - [50, 0.0] - - [1536, 1536, 1, 1536] - - [53, 0.0] + - [51, 0.0] - - [1664, 1664, 1, 1664] - - [54, 0.0] + - [52, 0.0] - - [2816, 2816, 1, 2816] - - [55, 0.0] + - [53, 0.0] - - [1536, 768, 1, 768] - - [56, 0.0] + - [54, 0.0] - - [1536, 768, 1, 6144] - - [57, 0.0] + - [55, 0.0] - - [1536, 3072, 1, 768] - - [58, 0.0] + - [56, 0.0] - - [384, 1536, 1, 1536] - - [59, 0.0] + - [57, 0.0] - - [2048, 65536, 1, 16384] - - [60, 0.0] + - [38, 0.0] - - [2048, 8192, 1, 16384] - - [61, 0.0] + - [58, 0.0] - - [1792, 1792, 1, 1792] - - [62, 0.0] + - [59, 0.0] - - [1920, 1920, 1, 1920] - - [63, 0.0] + - [60, 0.0] - - [2944, 2944, 1, 2944] - - [64, 0.0] + - [61, 0.0] - - [16384, 8, 1, 13312] - - [65, 0.0] + - [62, 0.0] - - [16384, 4096, 1, 13312] - - [66, 0.0] + - [63, 0.0] - - [2176, 2176, 1, 2176] - - [68, 0.0] + - [65, 0.0] - - [3072, 3072, 1, 3072] - - [69, 0.0] + - [66, 0.0] - - [2048, 32768, 1, 16384] - - [70, 0.0] + - [67, 0.0] - - [2304, 2304, 1, 2304] - - [71, 0.0] + - [68, 0.0] - - [2432, 2432, 1, 2432] - - [72, 0.0] + - [69, 0.0] - - [3200, 3200, 1, 3200] - - [73, 0.0] + - [70, 0.0] - - [256, 256, 1, 256] - - [74, 0.0] + - [71, 0.0] - - [384, 384, 1, 384] - - [75, 0.0] + - [72, 0.0] - - [640, 640, 1, 640] - - [77, 0.0] + - [74, 0.0] - - [768, 768, 1, 768] - - [78, 0.0] + - [75, 0.0] - - [896, 896, 1, 896] - - [79, 0.0] + - [76, 0.0] - - [3456, 3456, 1, 3456] - - [80, 0.0] + - [77, 0.0] - - [3584, 3584, 1, 3584] - - [81, 0.0] + - [78, 0.0] - - [16384, 2, 1, 2304] - - [82, 0.0] + - [79, 0.0] - - [4000, 2, 1, 8192] - - [85, 0.0] + - [82, 0.0] - - [5560, 1024, 1, 2780] - - [83, 0.0] + - [80, 0.0] - - [2780, 1024, 1, 5560] - - [84, 0.0] + - [81, 0.0] - - [22016, 12288, 1, 4096] - - [109, 0.0] + - [106, 0.0] - - [2048, 2048, 192, 128] - - [110, 0.0] + - [107, 0.0] - - [2048, 122880, 1, 512] - - [112, 0.0] + - [109, 0.0] - - [512, 122880, 1, 2048] - - [113, 0.0] + - [110, 0.0] - - [512, 122880, 1, 512] - - [114, 0.0] + - [111, 0.0] - - [3072, 1024, 32, 1024] - - [115, 159112.0] + - [112, 159112.0] - - [1024, 126976, 1, 512] - - [112, 159096.0] + - [109, 159096.0] - - [168, 51200, 1, 64] - - [116, 33202.8] + - [113, 33202.8] - - [544, 1024, 1, 256] - - [52, 36754.2] + - [50, 36754.2] - - [1216, 1024, 1, 64] - - [117, 23895.6] + - [114, 23895.6] - - [512, 126976, 1, 1024] - - [106, 170263.0] + - [103, 170263.0] - - [128, 1024, 1, 9775] - - [118, 60151.6] + - [115, 60151.6] - - [3072, 1024, 32, 3072] - - [115, 186442.0] + - [112, 186442.0] - - [88, 61440, 1, 256] - - [69, 55766.3] + - [66, 55766.3] - - [128, 512, 1, 32] - - [119, 1078.23] + - [116, 1078.23] - - [200, 15360, 1, 256] - [4, 65590.7] - - [60, 1024, 1, 36] - - [119, 961.67] + - [116, 961.67] - - [128, 1024, 1, 128] - - [77, 7247.18] + - [74, 7247.18] - - [1792, 512, 1, 128] - - [120, 32804.6] + - [117, 32804.6] - - [248, 248, 512, 128] - - [121, 56838.8] + - [118, 56838.8] - - [24, 512, 1, 32] - - [119, 196.118] + - [116, 196.118] - - [768, 1024, 28, 1536] - - [122, 156103.0] + - [119, 156103.0] - - [192, 61440, 1, 256] - - [123, 94816.3] + - [120, 94816.3] - - [176, 1024, 1, 48] - - [124, 3180.42] + - [121, 3180.42] - - [128, 512, 1, 64] - - [125, 2129.09] + - [122, 2129.09] - - [256, 32768, 1, 128] - - [110, 72623.7] + - [107, 72623.7] - - [3072, 1024, 28, 1024] - - [115, 157071.0] + - [112, 157071.0] - - [9775, 1024, 1, 128] - - [126, 69669.9] + - [123, 69669.9] - - [320, 512, 1, 768] - - [78, 34426.6] + - [75, 34426.6] - - [128, 1024, 1, 64] - - [127, 4042.7] + - [124, 4042.7] - - [896, 1024, 32, 1536] - - [115, 153624.0] + - [112, 153624.0] - - [512, 126976, 1, 128] - - [112, 98421.0] + - [109, 98421.0] - - [704, 512, 1, 128] - - [117, 15586.9] + - [114, 15586.9] - - [1179, 1024, 1, 64] - - [128, 18418.8] + - [125, 18418.8] - - [1536, 1024, 28, 1024] - - [122, 146789.0] + - [119, 146789.0] - - [104, 61440, 1, 256] - - [69, 64963.4] + - [66, 64963.4] - - [1536, 1024, 32, 1024] - - [115, 153084.0] + - [112, 153084.0] - - [896, 1024, 32, 3072] - - [129, 170029.0] + - [126, 170029.0] - - [1032, 1024, 1, 128] - - [58, 31384.3] + - [56, 31384.3] - - [2048, 2048, 64, 128] - - [130, 0.0] + - [111, 0.0] - - [2048, 2048, 64, 192] - - [130, 0.0] + - [111, 0.0] - - [32, 32, 6144, 64] - - [131, 0.0] + - [127, 0.0] - - [248, 248, 512, 1024] - - [132, 0.0] + - [128, 0.0] - null - null - DeviceEfficiency