From 67dae2fdea2f938ffa39a6b09abafc07b010413a Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Fri, 7 Nov 2025 07:31:41 +0000 Subject: [PATCH 1/2] port kernel --- ...k_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 77 +++++++++---------- 1 file changed, 35 insertions(+), 42 deletions(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index baa3f821d9b..b04fa34e208 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -26328,20 +26328,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x224x64_MI16fwZD_SxDnnVOKmLpTDrPrhupLHmkAIN1IXhJjWIM824= + BaseName: Cijk_Ailk_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1J2ed5Fhxf8DLJDiYbpPN48d6A8VIyjP0ZhnMLknaejw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 + ClusterLocalRead: 1 + CodeObjectVersion: '4' ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -26370,7 +26370,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA1_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 64 LSCB: 64 @@ -26381,13 +26381,13 @@ LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 3072 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126208 + LdsBytesNoAmax: 124160 LdsInitCVgprs: false - LdsNumBytes: 126208 + LdsNumBytes: 124160 LdsNumElementsAlignedA: 24832 - LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -26406,11 +26406,11 @@ LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26419,14 +26419,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [6, 7] + MIWaveTile: [6, 8] MIWaveTileA: 6 - MIWaveTileB: 7 + MIWaveTileB: 8 MIWaveTileMetadata: 0 MacroTile0: 192 - MacroTile1: 224 + MacroTile1: 256 MacroTileA: 192 - MacroTileB: 224 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26439,14 +26439,14 @@ MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 - MfmaInitCVgprs: false + MfmaInitCVgprs: true NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: true NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 4 NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 @@ -26454,14 +26454,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 168 - NumGlobalWriteVectorsPerThread: 168 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 NumLoadsA: 6 - NumLoadsB: 7 + NumLoadsB: 8 NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 7 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -26472,27 +26472,22 @@ PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 112 - SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA1_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 @@ -26502,9 +26497,9 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 24 - ThreadTile1: 7 + ThreadTile1: 8 ThreadTileA: 24 - ThreadTileB: 7 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26513,24 +26508,24 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: true UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 + WorkGroupMapping: 16 WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -26540,13 +26535,11 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false + _staggerStrideShift: 0 enableLDSTrA: true enableLDSTrB: false reorderGRInstForDTVA: false From 2b82f8e4a96bc52bc83c55fe2fa2bd360023e755 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Mon, 10 Nov 2025 01:35:56 +0000 Subject: [PATCH 2/2] add test --- .../Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index 8d0f8886001..c58e20becaf 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -110,6 +110,7 @@ BenchmarkProblems: BenchmarkFinalParameters: - ProblemSizes: - Range: [[255, 1, 257], [255, 1, 257], [1], [1, 37, 192]] + - Exact: [3264, 3072, 1, 1536] - BiasTypeArgs: ['b'] - # BenchmarkProblemSizeGroup - Standard - All problem InitialSolutionParameters: