From 7005142f8b5f02759ac40acf2feefcdb83b6c9c4 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Fri, 14 Nov 2025 10:45:22 +0000 Subject: [PATCH 01/10] cms 256x192x64TN --- .../Tensile/Components/CustomSchedule.py | 56 ++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index d3e7f275c0e..23e40b93b4b 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -352,6 +352,58 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): opt1 = ScheduleInfo(2, numMfma, optSchedule, syncCode, nglshift, nllshift) return True, opt1 +def _get_schedule_256x192x64_16bit(kernel, useLDSTr, TLDS): + kernel["MfmaInitCVgprs"] = True + + optSchedule = dict() + syncCode = [] + nglshift = nllshift = 0 # vmcnt shift for ngl and nll + if isTN(kernel) and not useLDSTr and TLDS == 1: + #index and code pair + syncTable = [ + -1, SWaitCnt(dscnt=5, vlcnt=-1, vscnt=-1, comment="wait for LRB1-0"), + 7, SWaitCnt(dscnt=10, vlcnt=-1, vscnt=-1, comment="wait for LRB1-1"), + 10, SBarrier(comment="for GRA"), + 15, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="wait for LRB1-2"), + 23, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="wait for LRB1 remaining"), + 50, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRA1"), + 50, SBarrier(comment="for LRA1"), + 70, SWaitCnt(dscnt=-1, vlcnt=12, vscnt=-1, comment="for LRB1"), + 70, SBarrier(comment="for LRB1"), + ] + optSchedule = { + 'SYNC' : [syncTable[::2]], + 'GRIncA': [[0,1,2,3,4,5,6,7,8]], + 'GRIncB': [[9,10,11,12,13,14,15,16,17]], + + 'LRA0': [[0, 2, 3, 4, 5, 6, 7, 8]], + #interleave LRB0 , GRA + 'LRB0': [[9, 11, 13, 15, 17, 19], + [10, 12, 14, 16, 18, 20]], + 'GRA': [[10,10, 12,12, 14,14, 16,16, 20,20, 31,31, 33,33, 35,35], + [11,11, 13,13, 15,15, 17,17, 21,21, 32,32, 34,34, 36,36]], + #interleave GRB, LRB1 + 'GRB': [[51,51, 55,55, 59,59, 63,63, 83,83, 85,85], + [52,52, 56,56, 60,60, 64,64, 84,84, 86,86]], + 'LRA1': [[50, 52, 57, 60, 62, 64, 66, 68], + [51, 53, 58, 61, 63, 65, 67, 69]], + + 'LRB1': [[70, 72, 74, 76, 78, 79]], + 'LRSA': [[20]], + 'LRSB': [[64]], + 'LWSA': [[40]], + 'LWSB': [[90]], + 'LCC' : [[95, 95]], + } + syncCode = syncTable[1::2] + nglshift = nllshift = 14 # vmcnt shift for ngl and nll + else: + return False, None + + numMfma = 96 + opt1 = ScheduleInfo(2, numMfma, optSchedule, syncCode, nglshift, nllshift) + return True, opt1 + def _get_schedule_256x256x128_8bit(kernel, useLDSTr, TLDS): kernel["MfmaInitCVgprs"] = True @@ -611,7 +663,7 @@ def hasCustomSchedule(kernel): is192x256x64DTL = [MT0, MT1, DU, PGR, PLR, DTL] == [192, 256, 64, 2, 1, True] is256x256x128DTL = [MT0, MT1, DU, PGR, PLR, DTL] == [256, 256, 128, 2, 0, True] is256x160x64DTL = [MT0, MT1, DU, PGR, PLR, DTL] == [256, 160, 64, 2, 1, True] - + is256x192x64DTL = [MT0, MT1, DU, PGR, PLR, DTL] == [256, 192, 64, 2, 1, True] if is256x256x64DTL and is16bit and not isMixed and ([GRVWA, GRVWB, LRVW] == [8,8,8]) and MI == [16,16,32,1] and MIWG == [2,2]: return _get_schedule_256x256x64_16bit(kernel, useLDSTr, TLDS) elif is256x256x128DTL and is8bit and not isMixed and ([GRVWA, GRVWB, LRVW] == [16, 16, 16]) and MI == [16,16,128,1] and MIWG == [2,2]: @@ -620,5 +672,7 @@ def hasCustomSchedule(kernel): return _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS) elif is256x160x64DTL and is16bit and not isMixed and ([GRVWA, GRVWB, LRVW] == [8,8,8]) and MI == [16,16,32,1] and MIWG == [2,2]: return _get_schedule_256x160x64_16bit(kernel, useLDSTr, TLDS) + elif is256x192x64DTL and is16bit and not isMixed and ([GRVWA, GRVWB, LRVW] == [8, 8, 8]) and MI == [16,16,32,1] and MIWG == [2,2]: + return _get_schedule_256x192x64_16bit(kernel, useLDSTr, TLDS) return False, None From 78bd1ea4f3ea5644d2644add1684e1622f18d4cf Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Mon, 17 Nov 2025 06:45:18 +0000 Subject: [PATCH 02/10] add test and yaml --- ...k_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 245 ++++++++++++++++++ .../Tensile/Components/CustomSchedule.py | 20 +- .../gfx950/custom_mainloop_scheduling.yaml | 37 ++- 3 files changed, 290 insertions(+), 12 deletions(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index c434ab07d59..3e76dc9ba53 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -306096,6 +306096,249 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 0 + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124672 + LdsInitCVgprs: false + LdsNumBytes: 124672 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 25344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: true + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumTotalPackedLoadsA: 8 + NumTotalPackedLoadsB: 6 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1308 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false - [2, 3, 0, 1] - - - [16, 368640, 1, 224] - [0, 0.0] @@ -308735,6 +308978,8 @@ - [1305, 0.0] - - [1920, 2048, 1, 15964] - [1306, 0.0] + - - [4096, 3072, 1, 8192] + - [1308, 0.0] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index 23e40b93b4b..81712f20589 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -360,17 +360,15 @@ def _get_schedule_256x192x64_16bit(kernel, useLDSTr, TLDS): nglshift = nllshift = 0 # vmcnt shift for ngl and nll if isTN(kernel) and not useLDSTr and TLDS == 1: #index and code pair - syncTable = [ - -1, SWaitCnt(dscnt=5, vlcnt=-1, vscnt=-1, comment="wait for LRB1-0"), - 7, SWaitCnt(dscnt=10, vlcnt=-1, vscnt=-1, comment="wait for LRB1-1"), - 10, SBarrier(comment="for GRA"), - 15, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="wait for LRB1-2"), - 23, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="wait for LRB1 remaining"), - 50, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRA1"), - 50, SBarrier(comment="for LRA1"), - 70, SWaitCnt(dscnt=-1, vlcnt=12, vscnt=-1, comment="for LRB1"), - 70, SBarrier(comment="for LRB1"), - ] + syncTable = [-1, SWaitCnt(dscnt=5, vlcnt=-1, vscnt=-1, comment="wait for LRB1-0"), + 7, SWaitCnt(dscnt=10, vlcnt=-1, vscnt=-1, comment="wait for LRB1-1"), + 10, SBarrier(comment="for GRA"), + 15, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="wait for LRB1-2"), + 23, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="wait for LRB1 remaining"), + 50, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRA1"), + 50, SBarrier(comment="for LRA1"), + 70, SWaitCnt(dscnt=-1, vlcnt=12, vscnt=-1, comment="for LRB1"), + 70, SBarrier(comment="for LRB1"),] optSchedule = { 'SYNC' : [syncTable[::2]], 'GRIncA': [[0,1,2,3,4,5,6,7,8]], diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index 097351d292f..1cba912f2a7 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -427,7 +427,42 @@ BenchmarkProblems: - Range: [[613], [612], [1], [1, 1, 64]] - Exact: [8192, 4096, 1, 2048] - BiasTypeArgs: ['b'] - + - # BenchmarkProblemSizeGroup - Standard - All problem + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16,32, 1, 1, 8, 6, 2,2 ] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - DepthU: [64] + - ScheduleIterAlg: [3] + - ExpandPointerSwap: [0] + - TransposeLDS: [1] #0,1 + - LocalReadVectorWidth: [8] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - DirectToLds: [1] + - StreamK: [3] + - LdsPadA: [8] #[-1] + - LdsPadB: [8] #[-1] + - StaggerU: [0] + - WorkGroupMapping: [16] + - WorkGroupMappingXCC: [2] + - 1LDSBuffer: [0] + - NonTemporalD: [4] + - SourceSwap: [1] + - UseSgprForGRO: [0] + - UseCustomMainLoopSchedule: [0, 1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [[256], [192], [1], [64, 64, 256]] + - Range: [[256], [192], [1], [1, 1, 64]] + - Range: [[256], [192], [1], [32, 64, 256]] + - Exact: [4096, 3072, 1, 8192] + - BiasTypeArgs: ['b'] ######################################## # HHS TN - standard ######################################## From 5ff7b653132c50f1ff7915612b818dc453aa9771 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Tue, 18 Nov 2025 08:39:37 +0000 Subject: [PATCH 03/10] cms for 256x192x64NT --- ...k_Bjlk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 246 ++++++++++++++++++ .../Tensile/Components/CustomSchedule.py | 32 +++ .../gfx950/custom_mainloop_scheduling.yaml | 111 +++++--- 3 files changed, 352 insertions(+), 37 deletions(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index addf1dc5f4d..5d526b04ddd 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -106324,6 +106324,250 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: 0 + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT2I0V6aAbNGrvu4MPtZRYDzSPuJsk8fmuhKPit68BvyOA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 256 + LSCB: 192 + LSPA: 8 + LSPB: 11 + LVCA: 32 + LVCB: 24 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumTotalPackedLoadsA: 8 + NumTotalPackedLoadsB: 6 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 452 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: true + enableLDSTrB: true + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false - [2, 3, 0, 1] - - - [160, 120, 1, 3072] - [0, 0.0] @@ -107239,6 +107483,8 @@ - [449, 0.0] - - [36864, 4096, 1, 4096] - [451, 0.0] + - - [4096, 3072, 1, 8192] + - [452, 0.0] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index 1a6973f24b6..c9e42ac44d2 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -436,6 +436,38 @@ def _get_schedule_256x192x64_16bit(kernel, useLDSTr, TLDS): } syncCode = syncTable[1::2] nglshift = nllshift = 14 # vmcnt shift for ngl and nll + if isNT(kernel) and useLDSTr and TLDS == 0: + #index and code pair + syncTable = [-1, SWaitCnt(dscnt=10, vlcnt=-1, vscnt=-1, comment="for LRB1-0"), + 7, SWaitCnt(dscnt=8+7, vlcnt=-1, vscnt=-1, comment="for LRB1-1"), + 15, SWaitCnt(dscnt=8+7, vlcnt=-1, vscnt=-1, comment="for LRB1-2"), + 17, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for GRA"), + 17, SBarrier(comment="for GRA"), + 54, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="wait for previous set of global reads"), + 54, SBarrier(comment="for GRB"),] + optSchedule = { + 'SYNC' : [syncTable[::2]], + 'GRIncA': [[0,1,2,3,4,5,6,7,8]], + 'GRIncB': [[9,10,11,12,13,14,15,16,17]], + + 'LRA0': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]], + 'LRB0': [[16,16, 18,18, 20,20, 22,22, 24,24, 26,26], + [17,17, 19,19, 21,21, 23,23, 25,25, 27,27]], + 'GRA': [[17,17, 19,19, 21,21, 23,23, 25,25, 30,30, 35,35, 40,40]], + + 'GRB': [[54,54, 59,59, 63,63, 67,67, 71,71, 73,73]], + 'LRA1': [[55,55, 57,57, 60,60, 62,62, 64,64, 66,66, 68,68, 72,72]], + + 'LRB1': [[73,73, 75,75, 77,77, 78,78, 79,79, 80,80]], + 'LRSA': [[46]], + 'LRSB': [[46]], + 'LWSA': [[76]], + 'LWSB': [[78]], + 'LCC' : [[95, 95]], + } + syncCode = syncTable[1::2] + nglshift = nllshift = 14 # vmcnt shift for ngl and nll else: return False, None diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index 96c6191599a..7db84499edd 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -506,42 +506,42 @@ BenchmarkProblems: - Range: [[6144], [8192], [1], [64, 64, 256]] - Exact: [3072, 4096, 1, 8192] - BiasTypeArgs: ['b'] - - # BenchmarkProblemSizeGroup - Standard - All problem - InitialSolutionParameters: - BenchmarkCommonParameters: - - KernelLanguage: ["Assembly"] - ForkParameters: - - MatrixInstruction: - - [16, 16,32, 1, 1, 8, 6, 2,2 ] - - PrefetchGlobalRead: [2] - - PrefetchLocalRead: [1] - - DepthU: [64] - - ScheduleIterAlg: [3] - - ExpandPointerSwap: [0] - - TransposeLDS: [1] #0,1 - - LocalReadVectorWidth: [8] - - GlobalReadVectorWidthA: [8] - - GlobalReadVectorWidthB: [8] - - DirectToLds: [1] - - StreamK: [3] - - LdsPadA: [8] #[-1] - - LdsPadB: [8] #[-1] - - StaggerU: [0] - - WorkGroupMapping: [16] - - WorkGroupMappingXCC: [2] - - 1LDSBuffer: [0] - - NonTemporalD: [4] - - SourceSwap: [1] - - UseSgprForGRO: [0] - - UseCustomMainLoopSchedule: [0, 1] - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [[256], [192], [1], [64, 64, 256]] - - Range: [[256], [192], [1], [1, 1, 64]] - - Range: [[256], [192], [1], [32, 64, 256]] - - Exact: [4096, 3072, 1, 8192] - - BiasTypeArgs: ['b'] + - # BenchmarkProblemSizeGroup - Standard - All problem + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16,32, 1, 1, 8, 6, 2,2 ] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - DepthU: [64] + - ScheduleIterAlg: [3] + - ExpandPointerSwap: [0] + - TransposeLDS: [1] #0,1 + - LocalReadVectorWidth: [8] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - DirectToLds: [1] + - StreamK: [3] + - LdsPadA: [8] #[-1] + - LdsPadB: [8] #[-1] + - StaggerU: [0] + - WorkGroupMapping: [16] + - WorkGroupMappingXCC: [2] + - 1LDSBuffer: [0] + - NonTemporalD: [4] + - SourceSwap: [1] + - UseSgprForGRO: [0] + - UseCustomMainLoopSchedule: [0, 1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [[256], [192], [1], [64, 64, 256]] + - Range: [[256], [192], [1], [1, 1, 64]] + - Range: [[256], [192], [1], [32, 64, 256]] + - Exact: [4096, 3072, 1, 8192] + - BiasTypeArgs: ['b'] ######################################## # HHS TN - standard ######################################## @@ -765,7 +765,44 @@ BenchmarkProblems: - ProblemSizes: - Exact: [8192, 4096, 1, 2048] - BiasTypeArgs: ['b'] - + - # BenchmarkProblemSizeGroup - Standard - All problem + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16,32, 1, 1, 8, 6, 2,2 ] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - DepthU: [64] + - ScheduleIterAlg: [3] + - ExpandPointerSwap: [0] + - LocalReadVectorWidth: [8] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - DirectToLds: [1] + - LDSTrInst: [1] + - TransposeLDS: [0] + - StreamK: [3] + - LdsPadA: [8] #[-1] + - LdsPadB: [8] #[-1] + - StaggerU: [0] + - WorkGroupMapping: [16] + - WorkGroupMappingXCC: [2] + - 1LDSBuffer: [0] + - NonTemporalD: [4] + - SourceSwap: [1] + - UseSgprForGRO: [0] + - UseCustomMainLoopSchedule: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [[256], [192], [1], [64, 64, 256]] + - Range: [[256], [192], [1], [1, 1, 64]] + - Range: [[256], [192], [1], [32, 64, 256]] + - Range: [[8192], [6144], [1], [64, 64, 256]] + - Exact: [4096, 3072, 1, 8192] + - BiasTypeArgs: ['b'] ######################################## # HHS NT - standard ######################################## From 391240059340fc0222bb7ac2d4d4513288a39260 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Wed, 19 Nov 2025 01:46:07 +0000 Subject: [PATCH 04/10] CMS for 256x192x64NN --- .../Tensile/Components/CustomSchedule.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index c9e42ac44d2..e0addeba332 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -468,6 +468,40 @@ def _get_schedule_256x192x64_16bit(kernel, useLDSTr, TLDS): } syncCode = syncTable[1::2] nglshift = nllshift = 14 # vmcnt shift for ngl and nll + if isNN(kernel) and not useLDSTr and TLDS == 1: + #index and code pair + + syncTable = [ + -1, SWaitCnt(dscnt=5, vlcnt=-1, vscnt=-1, comment="LRB1-0"), + 7, SWaitCnt(dscnt=11, vlcnt=-1, vscnt=-1, comment="LRB1-1"), + 8, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="LRB0"), + 8, SBarrier(comment=""), + 15, SWaitCnt(dscnt=15, vlcnt=-1, vscnt=-1, comment="LRB1 remaining"), + 54, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="wait for previous set of global reads"), + 54, SBarrier(comment="LRA1"), + ] + optSchedule = { + 'SYNC' : [syncTable[::2]], + 'GRIncA': [[0,1,2,3,4,5,6,7,8]], + 'GRIncB': [[9,10,11,12,13,14,15,16,17]], + + 'LRA0': [[0, 1, 2, 3, 4, 5, 6, 7]], + 'LRB0': [[8, 10, 12, 14, 16, 18]], + 'GRA': [[9,9, 18,18, 26,26, 30,30, 34,34, 40,40, 42,42, 44,44]], + 'GRB': [[54,54, 58,58, 62,62, 66,66, 70,70, 74,74]], + 'LRA1': [[55, 57, 59, 61, 63, 65, 67, 71]], + 'LRB1': [[72, 74, 76, 78, 80, 82]], + 'LRSA': [[46]], + 'LRSB': [[46]], + 'LWSA': [[33]], + 'LWSB': [[80]], + 'PackA0': [[47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51, 51, 52, 52, 52, 52, 53, 53, 53, 53, 53, 53]], + 'PackA1': [[-1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5]], + + 'LCC' : [[95, 95]], + } + syncCode = syncTable[1::2] + nglshift = nllshift = 14 # vmcnt shift for ngl and nll else: return False, None From 5bb085acd1b9370e662b63adfa4317c2ab6f1a74 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Wed, 19 Nov 2025 10:16:12 +0000 Subject: [PATCH 05/10] typo --- .../Tensile/Components/CustomSchedule.py | 70 +------------------ .../gfx950/custom_mainloop_scheduling.yaml | 1 + 2 files changed, 2 insertions(+), 69 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index 142fb20bf86..03b1158cbe2 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -358,75 +358,7 @@ def _get_schedule_256x192x64_16bit(kernel, useLDSTr, TLDS): optSchedule = dict() syncCode = [] nglshift = nllshift = 0 # vmcnt shift for ngl and nll - if isTN(kernel) and not useLDSTr and TLDS == 1: - #index and code pair - syncTable = [-1, SWaitCnt(dscnt=5, vlcnt=-1, vscnt=-1, comment="wait for LRB1-0"), - 7, SWaitCnt(dscnt=10, vlcnt=-1, vscnt=-1, comment="wait for LRB1-1"), - 10, SBarrier(comment="for GRA"), - 15, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="wait for LRB1-2"), - 23, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="wait for LRB1 remaining"), - 50, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRA1"), - 50, SBarrier(comment="for LRA1"), - 70, SWaitCnt(dscnt=-1, vlcnt=12, vscnt=-1, comment="for LRB1"), - 70, SBarrier(comment="for LRB1"),] - optSchedule = { - 'SYNC' : [syncTable[::2]], - 'GRIncA': [[0,1,2,3,4,5,6,7,8]], - 'GRIncB': [[9,10,11,12,13,14,15,16,17]], - - 'LRA0': [[0, 2, 3, 4, 5, 6, 7, 8]], - #interleave LRB0 , GRA - 'LRB0': [[9, 11, 13, 15, 17, 19], - [10, 12, 14, 16, 18, 20]], - 'GRA': [[10,10, 12,12, 14,14, 16,16, 20,20, 31,31, 33,33, 35,35], - [11,11, 13,13, 15,15, 17,17, 21,21, 32,32, 34,34, 36,36]], - #interleave GRB, LRB1 - 'GRB': [[51,51, 55,55, 59,59, 63,63, 83,83, 85,85], - [52,52, 56,56, 60,60, 64,64, 84,84, 86,86]], - 'LRA1': [[50, 52, 57, 60, 62, 64, 66, 68], - [51, 53, 58, 61, 63, 65, 67, 69]], - - 'LRB1': [[70, 72, 74, 76, 78, 79]], - 'LRSA': [[20]], - 'LRSB': [[64]], - 'LWSA': [[40]], - 'LWSB': [[90]], - 'LCC' : [[95, 95]], - } - syncCode = syncTable[1::2] - nglshift = nllshift = 14 # vmcnt shift for ngl and nll - if isNT(kernel) and useLDSTr and TLDS == 0: - #index and code pair - syncTable = [-1, SWaitCnt(dscnt=10, vlcnt=-1, vscnt=-1, comment="for LRB1-0"), - 7, SWaitCnt(dscnt=8+7, vlcnt=-1, vscnt=-1, comment="for LRB1-1"), - 15, SWaitCnt(dscnt=8+7, vlcnt=-1, vscnt=-1, comment="for LRB1-2"), - 17, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for GRA"), - 17, SBarrier(comment="for GRA"), - 54, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="wait for previous set of global reads"), - 54, SBarrier(comment="for GRB"),] - optSchedule = { - 'SYNC' : [syncTable[::2]], - 'GRIncA': [[0,1,2,3,4,5,6,7,8]], - 'GRIncB': [[9,10,11,12,13,14,15,16,17]], - - 'LRA0': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]], - 'LRB0': [[16,16, 18,18, 20,20, 22,22, 24,24, 26,26], - [17,17, 19,19, 21,21, 23,23, 25,25, 27,27]], - 'GRA': [[17,17, 19,19, 21,21, 23,23, 25,25, 30,30, 35,35, 40,40]], - - 'GRB': [[54,54, 59,59, 63,63, 67,67, 71,71, 73,73]], - 'LRA1': [[55,55, 57,57, 60,60, 62,62, 64,64, 66,66, 68,68, 72,72]], - - 'LRB1': [[73,73, 75,75, 77,77, 78,78, 79,79, 80,80]], - 'LRSA': [[46]], - 'LRSB': [[46]], - 'LWSA': [[76]], - 'LWSB': [[78]], - 'LCC' : [[95, 95]], - } - syncCode = syncTable[1::2] - nglshift = nllshift = 14 # vmcnt shift for ngl and nll + if isNN(kernel) and not useLDSTr and TLDS == 1: #index and code pair diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index eb277faa3b3..cd1f965d85b 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -470,6 +470,7 @@ BenchmarkProblems: - Range: [[613], [612], [1], [1, 1, 64]] - Exact: [8192, 4096, 1, 2048] - BiasTypeArgs: ['b'] + ######################################## # HHS TN - standard ######################################## From 9c0e14e712c043a8d2ee48dd06d35920451cb8f1 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Wed, 19 Nov 2025 10:38:28 +0000 Subject: [PATCH 06/10] add test --- ...k_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 246 ++++++++++++++++++ .../gfx950/custom_mainloop_scheduling.yaml | 42 +++ 2 files changed, 288 insertions(+) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index 74a746c0a97..b34109187f0 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -129817,6 +129817,250 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_BBS_BH_Bias_SAV_UserArgs_MT256x19X1T2nrREg61DNmVW2UowK05QH9S5f463dYbDIWwItXg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 64 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123264 + LdsInitCVgprs: false + LdsNumBytes: 123264 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 24960 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumTotalPackedLoadsA: 8 + NumTotalPackedLoadsB: 6 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 549 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false - [2, 3, 0, 1] - - - [112, 491520, 1, 128] - [0, 0.0] @@ -130920,6 +131164,8 @@ - [547, 0.0] - - [2560, 4096, 1, 8192] - [548, 0.0] + - - [4096, 3072, 1, 8192] + - [549, 0.0] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index cd1f965d85b..3cf4739568e 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -229,6 +229,48 @@ BenchmarkProblems: - Range: [[256], [160], [1], [32, 64, 256]] - Range: [[8192], [5120], [1], [64, 64, 256]] - BiasTypeArgs: ['b'] + - # BenchmarkProblemSizeGroup - Standard - All problem + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16,32, 1, 1, 8, 6, 2,2 ] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - DepthU: [64] + - ScheduleIterAlg: [3] + - ExpandPointerSwap: [0] + - TransposeLDS: [1] + - LocalReadVectorWidth: [8] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - DirectToLds: [1] + - StreamK: [3] + - LdsPadA: [-1] + - LdsPadB: [-1] + - StaggerU: [0] + - 1LDSBuffer: [0] + - NonTemporalA: [3] + - NonTemporalB: [3] + - NonTemporalD: [4] + - SourceSwap: [1] + - LdsPadA: [-1] + - LdsPadB: [-1] + - LdsBlockSizePerPadA: [-1] + - LdsBlockSizePerPadB: [-1] + - LDSTrInst: [0] + - UseSgprForGRO: [0] + - UseCustomMainLoopSchedule: [0,1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [4096, 3072, 1, 8192] + - Range: [[256], [192], [1], [64, 64, 256]] + - Range: [[256], [192], [1], [1,1,64]] + - Range: [[256], [192], [1], [32, 64, 256]] + - Range: [[8192], [6144], [1], [64, 64, 256]] + - BiasTypeArgs: ['b'] ######################################## # HHS NN - standard ######################################## From 7239869b4a2a7d9fa900722dff0c8dbd50f6c18d Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Thu, 20 Nov 2025 10:48:32 +0000 Subject: [PATCH 07/10] refine s_wait --- .../Tensile/Components/CustomSchedule.py | 39 ++++++++++++------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index 03b1158cbe2..2a2ab1b195f 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -362,26 +362,35 @@ def _get_schedule_256x192x64_16bit(kernel, useLDSTr, TLDS): if isNN(kernel) and not useLDSTr and TLDS == 1: #index and code pair - syncTable = [ - -1, SWaitCnt(dscnt=5, vlcnt=-1, vscnt=-1, comment="LRB1-0"), - 7, SWaitCnt(dscnt=11, vlcnt=-1, vscnt=-1, comment="LRB1-1"), - 8, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="LRB0"), - 8, SBarrier(comment=""), - 15, SWaitCnt(dscnt=15, vlcnt=-1, vscnt=-1, comment="LRB1 remaining"), - 54, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="wait for previous set of global reads"), - 54, SBarrier(comment="LRA1"), - ] + syncTable = [-1, SWaitCnt(dscnt=5, vlcnt=-1, vscnt=-1, comment="LRB1-0"), + 7, SWaitCnt(dscnt=10, vlcnt=-1, vscnt=-1, comment="LRB1-1"), + 8, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="LRB0"), + 8, SBarrier(comment=""), + 15, SWaitCnt(dscnt=11, vlcnt=-1, vscnt=-1, comment="LRB1-remaining"), + 47, SWaitCnt(dscnt=5, vlcnt=-1, vscnt=-1, comment="LRB0-0"), + 54, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="wait for previous GRA"), + 54, SBarrier(comment="LRA1"), + 55, SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="LRB0-1"), + 63, SWaitCnt(dscnt=4+2, vlcnt=-1, vscnt=-1, comment="LRB0-2"), + 71, SWaitCnt(dscnt=7+1, vlcnt=-1, vscnt=-1, comment="LRB0-3"), + 79, SWaitCnt(dscnt=11, vlcnt=-1, vscnt=-1, comment="LRB0-remaining"),] optSchedule = { 'SYNC' : [syncTable[::2]], 'GRIncA': [[0,1,2,3,4,5,6,7,8]], 'GRIncB': [[9,10,11,12,13,14,15,16,17]], - 'LRA0': [[0, 1, 2, 3, 4, 5, 6, 7]], - 'LRB0': [[8, 10, 12, 14, 16, 18]], - 'GRA': [[9,9, 18,18, 26,26, 30,30, 34,34, 40,40, 42,42, 44,44]], - 'GRB': [[54,54, 58,58, 62,62, 66,66, 70,70, 74,74]], - 'LRA1': [[55, 57, 59, 61, 63, 65, 67, 71]], - 'LRB1': [[72, 74, 76, 78, 80, 82]], + 'LRA0': [[0, 1, 2, 3, 4, 5, 6, 7], + [1, 2, 3, 4, 5, 6, 7, 8]], + 'LRB0': [[8, 10, 12, 14, 16, 18], + [9, 11, 13, 15, 17, 19]], + 'GRA': [[9,9, 18,18, 26,26, 30,30, 34,34, 40,40, 42,42, 44,44], + [10,10, 19,19, 27,27, 31,31, 35,35, 41,41, 43,43, 45,45]], + 'GRB': [[54,54, 58,58, 62,62, 66,66, 70,70, 74,74], + [55,55, 59,59, 63,63, 67,67, 71,71, 75,75]], + 'LRA1': [[55, 57, 59, 61, 63, 65, 67, 71], + [56, 58, 60, 62, 64, 66, 68, 72]], + 'LRB1': [[72, 74, 76, 78, 80, 82], + [73, 75, 77, 79, 81, 83]], 'LRSA': [[46]], 'LRSB': [[46]], 'LWSA': [[33]], From ea4cba21aeee74adf951dba8acc112f972c01760 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Wed, 26 Nov 2025 06:29:05 +0000 Subject: [PATCH 08/10] refine --- ...k_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 40 +++++----- .../Tensile/Components/CustomSchedule.py | 74 +++++++++---------- .../gfx950/custom_mainloop_scheduling.yaml | 2 +- 3 files changed, 56 insertions(+), 60 deletions(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index b34109187f0..928dac79c06 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -129828,7 +129828,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_BBS_BH_Bias_SAV_UserArgs_MT256x19X1T2nrREg61DNmVW2UowK05QH9S5f463dYbDIWwItXg= + BaseName: Cijk_Ailk_Bljk_BBS_BH_Bias_SAV_UserArgs_MT256x19efg19fI0a0QQO2NPQwOSTE-N35cXpB90pMQj9VJB-ns= BufferLoad: true BufferStore: true CUCount: null @@ -129859,7 +129859,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 8 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -129871,8 +129871,8 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 256 LSCB: 64 LSPA: 8 @@ -129884,22 +129884,22 @@ LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123264 + LdsBytesNoAmax: 123776 LdsInitCVgprs: false - LdsNumBytes: 123264 - LdsNumElementsAlignedA: 32768 + LdsNumBytes: 123776 + LdsNumElementsAlignedA: 33280 LdsNumElementsAlignedB: 24960 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 8 @@ -129940,7 +129940,7 @@ MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 - MfmaInitCVgprs: false + MfmaInitCVgprs: true NoLdsWriteCode: true NoReject: false NoTailLoop: false @@ -129956,7 +129956,7 @@ NonTemporalWS: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 24 + NumGlobalWriteVectorsPerThread: 192 NumLoadsA: 8 NumLoadsB: 6 NumLoadsCoalescedA: 1 @@ -129982,7 +129982,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 549 - SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_SAV_UserArgs_MT256x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW8_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA8_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -129992,7 +129992,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 8 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -130002,7 +130002,7 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false - SwapGlobalReadOrder: false + SwapGlobalReadOrder: true ThreadTile: [1, 1] ThreadTile0: 32 ThreadTile1: 6 @@ -130029,7 +130029,7 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 8 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -130054,8 +130054,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: true + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index 005e293c745..2e62ebd80ef 100755 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -397,47 +397,43 @@ def _get_schedule_256x192x64_16bit(kernel, useLDSTr, TLDS): syncCode = [] nglshift = nllshift = 0 # vmcnt shift for ngl and nll - if isNN(kernel) and not useLDSTr and TLDS == 1: + if isNN(kernel) and useLDSTr and TLDS == 1: + kernel["SwapGlobalReadOrder"] = True #index and code pair - - syncTable = [-1, SWaitCnt(dscnt=5, vlcnt=-1, vscnt=-1, comment="LRB1-0"), - 7, SWaitCnt(dscnt=10, vlcnt=-1, vscnt=-1, comment="LRB1-1"), - 8, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="LRB0"), - 8, SBarrier(comment=""), - 15, SWaitCnt(dscnt=11, vlcnt=-1, vscnt=-1, comment="LRB1-remaining"), - 47, SWaitCnt(dscnt=5, vlcnt=-1, vscnt=-1, comment="LRB0-0"), - 54, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="wait for previous GRA"), - 54, SBarrier(comment="LRA1"), - 55, SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="LRB0-1"), - 63, SWaitCnt(dscnt=4+2, vlcnt=-1, vscnt=-1, comment="LRB0-2"), - 71, SWaitCnt(dscnt=7+1, vlcnt=-1, vscnt=-1, comment="LRB0-3"), - 79, SWaitCnt(dscnt=11, vlcnt=-1, vscnt=-1, comment="LRB0-remaining"),] + syncTable = [-1, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="wait for LRA1"), + 15, SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="wait for LRB0"), + 15, SBarrier(comment=""), + 46, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment=""), + 51, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="wait for previous set of global reads"), + 51, SBarrier(comment=""), + 63, SWaitCnt(dscnt=-1, vlcnt=14-4, vscnt=-1, comment="wait for previous set of global reads"), + 63, SBarrier(comment=""), + ] optSchedule = { - 'SYNC' : [syncTable[::2]], - 'GRIncA': [[0,1,2,3,4,5,6,7,8]], - 'GRIncB': [[9,10,11,12,13,14,15,16,17]], - - 'LRA0': [[0, 1, 2, 3, 4, 5, 6, 7], - [1, 2, 3, 4, 5, 6, 7, 8]], - 'LRB0': [[8, 10, 12, 14, 16, 18], - [9, 11, 13, 15, 17, 19]], - 'GRA': [[9,9, 18,18, 26,26, 30,30, 34,34, 40,40, 42,42, 44,44], - [10,10, 19,19, 27,27, 31,31, 35,35, 41,41, 43,43, 45,45]], - 'GRB': [[54,54, 58,58, 62,62, 66,66, 70,70, 74,74], - [55,55, 59,59, 63,63, 67,67, 71,71, 75,75]], - 'LRA1': [[55, 57, 59, 61, 63, 65, 67, 71], - [56, 58, 60, 62, 64, 66, 68, 72]], - 'LRB1': [[72, 74, 76, 78, 80, 82], - [73, 75, 77, 79, 81, 83]], - 'LRSA': [[46]], - 'LRSB': [[46]], - 'LWSA': [[33]], - 'LWSB': [[80]], - 'PackA0': [[47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51, 51, 52, 52, 52, 52, 53, 53, 53, 53, 53, 53]], - 'PackA1': [[-1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5]], - - 'LCC' : [[95, 95]], - } + 'SYNC' : [syncTable[::2]], + 'GRIncA': [[0,1,2,3,4,5,6,7,8]], + 'GRIncB': [[9,10,11,12,13,14,15,16,17]], + + 'LRB0': [[-1, 0, 1, 2, 3, 4], + [0, 1, 2, 3, 4, 5]], + 'LRA0': [[6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 25, 27, 29, 31, 33, 35], + [7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 26, 28, 30, 32, 34, 36]], + 'GRA': [[15,15, 17,17, 27,27, 29,29, 31,31, 33,33], + [16, 16, 18, 18, 28, 28, 30, 30, 32, 32, 34, 34]], + + 'GRB': [[50,50, 52,52, 54,54, 56,56, 66,66, 68,68, 70,70, 72,72], + [51,51, 53,53, 55,55, 57,57, 67,67, 69,69, 71,71, 73,73]], + 'LRB1': [[51, 53, 55, 57, 59, 61], + [52, 54, 56, 58, 60, 62]], + 'LRA1': [[63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93], + [64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94]], + + 'LRSB': [[14]], + 'LRSA': [[45]], + 'LWSB': [[94]], + 'LWSA': [[94]], + 'LCC' : [[95, 95]], + } syncCode = syncTable[1::2] nglshift = nllshift = 14 # vmcnt shift for ngl and nll else: diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index b797231dac9..e7764c47050 100755 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -258,7 +258,7 @@ BenchmarkProblems: - LdsPadB: [-1] - LdsBlockSizePerPadA: [-1] - LdsBlockSizePerPadB: [-1] - - LDSTrInst: [0] + - LDSTrInst: [1] - UseSgprForGRO: [0] - UseCustomMainLoopSchedule: [0,1] BenchmarkJoinParameters: From 01936dd2fc7fd2c2903af48963460e6c93368de4 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Wed, 26 Nov 2025 06:38:33 +0000 Subject: [PATCH 09/10] typo --- .../hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py | 1 - 1 file changed, 1 deletion(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index 67ae92b4979..25d5a4eeb62 100755 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -451,7 +451,6 @@ def _get_schedule_256x192x64_16bit(kernel, useLDSTr, TLDS): optSchedule = dict() syncCode = [] nglshift = nllshift = 0 # vmcnt shift for ngl and nll - if isTN(kernel) and not useLDSTr and TLDS == 1: #index and code pair syncTable = [-1, SWaitCnt(dscnt=5, vlcnt=-1, vscnt=-1, comment="wait for LRB1-0"), From 18108e3527f588d1fa88f23f458402df506a0a14 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Fri, 28 Nov 2025 01:30:56 +0000 Subject: [PATCH 10/10] change 1st GRB to 51 --- .../tensilelite/Tensile/Components/CustomSchedule.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index 25d5a4eeb62..d83a66e4925 100755 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -517,8 +517,8 @@ def _get_schedule_256x192x64_16bit(kernel, useLDSTr, TLDS): 'GRA': [[15,15, 17,17, 27,27, 29,29, 31,31, 33,33], [16, 16, 18, 18, 28, 28, 30, 30, 32, 32, 34, 34]], - 'GRB': [[50,50, 52,52, 54,54, 56,56, 66,66, 68,68, 70,70, 72,72], - [51,51, 53,53, 55,55, 57,57, 67,67, 69,69, 71,71, 73,73]], + 'GRB': [[51,51, 53,53, 55,55, 57,57, 67,67, 69,69, 71,71, 73,73], + [52,52, 54,54, 56,56, 58,58, 68,68, 70,70, 72,72, 74,74]], 'LRB1': [[51, 53, 55, 57, 59, 61], [52, 54, 56, 58, 60, 62]], 'LRA1': [[63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93],