From 0adc241e33f90382bd66adc55f34518af33802d1 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Tue, 11 Nov 2025 08:08:32 -0700 Subject: [PATCH 1/7] Refactors CustomSchedule.py for scale out of schedules (#2574) ## Motivation We are about to land a significant number of custom schedules for various tile sizes and transpose orientations. By using the convention "a schedule is described in a function" we can reasonably scale the code out, instead of a chain of if/elif blocks that spans hundreds or thousands of lines. Initially we keep the existing schedules in functions inside CustomSchedule.py, but we can refactor as needed. ## Technical Details ## Test Plan TENSILELITE_CLIENT_ARGS="--build-type Debug --gpu-targets gfx950 --clean" tox -e py39 -- Tensile/Tests -k custom_mainloop_scheduling.yaml ## Test Result --- .../Tensile/Components/CustomSchedule.py | 491 +++++++++--------- 1 file changed, 250 insertions(+), 241 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index 25803e45345..ed7d59e2efc 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -248,6 +248,253 @@ def scheduleInst2(instList, macroGuard=""): return module, numCodePath +def isNN(kernel): + return not kernel["ProblemType"]["TransposeA"] and not kernel["ProblemType"]["TransposeB"] + +def isNT(kernel): + return not kernel["ProblemType"]["TransposeA"] and kernel["ProblemType"]["TransposeB"] + +def isTT(kernel): + return kernel["ProblemType"]["TransposeA"] and kernel["ProblemType"]["TransposeB"] + +def isTN(kernel): + return kernel["ProblemType"]["TransposeA"] and not kernel["ProblemType"]["TransposeB"] + +def _get_schedule_256x256x64_16bit(kernel, useLDSTr, TLDS): + kernel["MfmaInitCVgprs"] = True + + optSchedule = dict() + syncCode = [] + + if isTN(kernel) and TLDS == 1: + optSchedule = { + 'SYNC' : [[19,20, 50,51, 67,68, 104, 105]], + 'GRIncA' : [[0,1,2,3,4,5,6,7,8]], + 'GRIncB' : [[9,10,11,12,13,14,15,16,17]], + 'LRA0' : [[0,2,4,6,8,10,12,14], + [1,3,5,7,9,11,13,15]], + 'LRB0' : [[24,27,30,33,36,38,40,42], + [22,25,28,31,34,37,39,41]], + 'GRA' : [[21,22, 23,25, 26,28, 29,31, 32,34, 35,52, 53,55, 56,58], + [21,23, 24,26, 27,29, 30,32, 33,35, 36,53, 54,56, 57,59]], + 'GRB' : [[59,61, 62,64, 65,85, 86,87, 88,89, 94,96, 98,100, 102,124], + [60,62, 63,65, 66,84, 85,86, 87,88, 93,95, 97,99, 103,123]], + 'LRA1' : [[69,71,73,75,77,79,81,83], + [70,72,74,76,78,80,82,90]], + 'LRB1' : [[106,108,110,112,114,116,118,120], + [107,109,111,113,115,117,119,121]], + 'LRSA' : [[16]], + 'LRSB' : [[83]], + 'LWSA' : [[125]], + 'LWSB' : [[125]], + 'LCC' : [[126, 126]], + } + syncCode = [SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=(2 + 8 + 8), vscnt=-1, comment="Wait for previous GRA to completely"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=15, vscnt=-1, comment="Wait for previous GRA to completely"), + SBarrier(comment="")] + elif isNT(kernel) and not useLDSTr and TLDS == 0: + kernel["UsePLRPack"] = True + + optSchedule = { + 'SYNC' : [[12,13, 36,44, 56,59, 66,68, 73,92]], + 'GRIncA' : [[0,1,2,3,4,5,6,7,8]], + 'GRIncB' : [[28,29,30,31,32,33,34,35,36]], + 'LRA0' : [[0,0,2,2,4,4,6,6], + [1,1,3,3,5,5,7,7]], + 'LRB0' : [[8,8,10,10,15,15,18,18], + [9,9,11,11,14,14,17,17]], + 'GRA' : [[14,14, 17,17, 20,20, 23,23, 26,26, 45,45, 48,48, 51,51], + [15,15, 18,18, 21,21, 24,24, 27,27, 46,46, 49,49, 52,52]], + 'GRB' : [[54,54, 57,57, 87,87,90,90,93,93,96,96,99,99, 123,123], + [55,55, 58,58, 88,88,91,91,94,94,97,97,100,100, 124,124]], + 'LRA1' : [[60,60,62,62,64,64,66,66], + [61,61,63,63,65,65,67,67]], + 'LRB1' : [[69,69,71,71,73,73,75,75], + [70,70,72,72,74,74,76,76]], + 'LRSA' : [[59]], + 'LRSB' : [[59]], + 'LWSA' : [[125]], + 'LWSB' : [[125]], + 'LCC' : [[126, 126]], + 'PackA0' : [[16,16, 19,19, 21,21, 22,22, 24,24, 25,25, 27,27, 28,28, 29,29, 30,30, 31,31, 32,32, 33,33, 34,34, 35,35, 36,36], + [16,16, 19,19, 20,20, 22,22, 23,23, 25,25, 26,26, 28,28, 29,29, 30,30, 31,31, 32,32, 33,33, 34,34, 35,35, 36,36]], + 'PackB0' : [[37,37, 38,38, 39,39, 40,40, 41,41, 42,42, 43,43, 46,46, 47,47, 49,49, 50,50, 52,52, 53,53, 55,55, 56,56, 58,58], + [37,37, 38,38, 39,39, 40,40, 41,41, 42,42, 43,43, 45,45, 47,47, 48,48, 50,50, 51,51, 53,53, 54,54, 56,56, 57,57]], + 'PackA1' : [[74,74, 76,76, 77,77, 78,78, 79,79, 80,80, 81,81, 82,82, 83,83, 84,84, 85,85, 86,86, 88,88, 89,89, 91,91, 92,92], + [75,75, 77,77, 78,78, 79,79, 80,80, 81,81, 82,82, 83,83, 84,84, 85,85, 86,86, 87,87, 89,89, 90,90, 92,92, 93,93]], + 'PackB1' : [[94,94, 95,95, 97,97, 98,98, 100,100, 101,101, 102,102, 103,103, 104,104, 105,105, 106,106, 107,107, 108,108, 109,109, 110,110, 111,111], + [95,95, 96,96, 98,98, 99,99, 101,101, 102,102, 103,103, 104,104, 105,105, 106,106, 107,107, 108,108, 109,109, 110,110, 111,111, 112,112]], + } + syncCode = [SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=17, vscnt=-1, comment="Wait for GRA to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=9, vscnt=-1, comment="Wait for GRB to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA1 to complete"), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB1 to complete")] + elif (isNN(kernel) or isTT(kernel)) and not useLDSTr and TLDS == 1: + kernel["UsePLRPack"] = True + + optSchedule = { + 'SYNC' : [[8, 12,13, 36,44, 56,59, 66,68, 73]], + 'GRIncA' : [[0,1,2,3,4,5,6,7,8]], + 'GRIncB' : [[28,29,30,31,32,33,34,35,36]], + 'LRA0' : [[0,0,2,2,4,4,6,6], + [1,1,3,3,5,5,7,7]], + 'LRB0' : [[9,11, 15,18,21,24,27,30], + [10,12, 14,17,20,23,26,29]], + 'GRA' : [[14,14, 17,17, 20,20, 23,23, 26,26, 45,45, 48,48, 51,51], + [15,15, 18,18, 21,21, 24,24, 27,27, 46,46, 49,49, 52,52]], + 'GRB' : [[54,54, 57,57, 87,87,90,90,93,93,96,96,99,99, 123,123], + [55,55, 58,58, 88,88,91,91,94,94,97,97,100,100, 124,124]], + 'LRA1' : [[60,60,62,62,64,64,66,66], + [61,61,63,63,65,65,67,67]], + 'LRB1' : [[68,70,72,74,76,78,80,82], + [69,71,73,75,77,79,81,83]], + 'LRSA' : [[59]], + 'LRSB' : [[59]], + 'LWSA' : [[125]], + 'LWSB' : [[125]], + 'LCC' : [[126, 126]], + 'PackA0' : [[8,8, 16,16, 19,19, 22,22, 25,25, 28,28, 29,29, 31,31, 32,32, 33,33, 34,34, 35,35, 36,36, 37,37, 38,38, 39,39]], + 'PackA1' : [[75,75, 77,77, 79,79, 81,81, 83,83, 84,84, 85,85, 86,86, 88,88, 89,89, 91,91, 92,92, 94,94, 95,95, 97,97, 98,98], + [74,74, 76,76, 78,78, 80,80, 82,82, 84,84, 85,85, 86,86, 87,87, 89,89, 90,90, 92,92, 93,93, 95,95, 96,96, 98,98]], + } + syncCode = [SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 first half to complete"), + SWaitCnt(dscnt=1, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=17, vscnt=-1, comment="Wait for GRA to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=9, vscnt=-1, comment="Wait for GRB to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA1 to complete")] + if isTT(kernel): + kernel["SwapGlobalReadOrder"] = True + + optSchedule['GRIncA'], optSchedule['GRIncB'] = optSchedule['GRIncB'], optSchedule['GRIncA'] + optSchedule['LRA0'], optSchedule['LRB0'] = optSchedule['LRB0'], optSchedule['LRA0'] + optSchedule['LRA1'], optSchedule['LRB1'] = optSchedule['LRB1'], optSchedule['LRA1'] + optSchedule['PackB0'] = optSchedule['PackA0'] + optSchedule['PackB1'] = optSchedule['PackA1'] + del optSchedule['PackA0'], optSchedule['PackA1'] + else: + return False, None + + + numMfma = 128 + opt1 = ScheduleInfo(2, numMfma, optSchedule, syncCode) + return True, opt1 + +def _get_schedule_256x256x128_8bit(kernel, useLDSTr, TLDS): + kernel["MfmaInitCVgprs"] = True + + optSchedule = dict() + syncCode = [] + + plr = 3 if kernel["ForceUnrollSubIter"] else 1 + + if isTN(kernel) and TLDS == 1: + optSchedule = { + 'SYNC' : [[6,7, 20,21, 46,47, 61]], + 'GRIncA' : [[0,1,2,3,4,4,4,4,4]], + 'GRIncB' : [[5,5,5,5,5,6,6,6,6]], + 'LRA0' : [[0,0, 1,1, 2,2, 3,3]], + 'GRA' : [[8,8,9,9,10,10,11,11,12,12, 23,23,24,24,25,25]], + 'LRB0' : [[13,13,14,14,15,15,16,16]], + 'LRA%u'%plr : [[48,48,49,49,50,50,51,51]], + 'LRB%u'%plr : [[52,52,54,54,55,55,56,56]], + 'GRB' : [[26,26,27,27, 39,39,40,40,41,41,42,42,43,43, 53,53]], + 'LCC' : [[60, 60]], + 'LRSA' : [[17]], + 'LRSB' : [[17]], + 'LWSA' : [[57]], + 'LWSB' : [[57]], + } + syncCode = [SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0/LRB0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0/LRB0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=15, vscnt=-1, comment="Wait for GRA to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for PLR to complete")] + else: + return False, None + + numMfma = 64 + # B0A0, B0A1, B1A0, B1A1 + mfmaReorder = [] + if not kernel["ForceUnrollSubIter"]: + mfmaReorder = [0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27, + 4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31, + 32,33,34,35, 40,41,42,43, 48,49,50,51, 56,57,58,59, + 36,37,38,39, 44,45,46,47, 52,53,54,55, 60,61,62,63] + opt1 = ScheduleInfo(1, numMfma, optSchedule, syncCode, mfmaReorder) + return True, opt1 + +def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): + kernel["MfmaInitCVgprs"] = True + + optSchedule = dict() + syncCode = [] + if isNN(kernel) and useLDSTr and TLDS==1: + # TODO: This schedule can be improved when BC are resolved for MT192 + # Note: A/B Global read orders are swapped + # i.e. GRA contains GR for B + kernel["SwapGlobalReadOrder"] = True + optSchedule = { + 'SYNC' : [[12,13, 47,48,49,50,51, 52,53, 56,56, 94]], + 'GRIncB' : [[0,1,2,3,4,5,6,7,8]], + 'GRIncA' : [[9,10,11,12,13,14,15,16,17]], + 'LRB0' : [[0,0,1,1,2,2,6,8], + [3,3,4,4,5,5,7,9]], + # These local reads have BC + 'LRA0' : [[10, 15,17,19,21,23, 25,27,29,33,37,39], + [11, 14,16,18,20,22, 24,26,28,32,36,38]], + 'GRA' : [[14,14, 16,16, 18,18, 20,20, 22,22, 34,34,36,36,38,38], + [15,15, 17,17, 19,19, 21,21, 23,23, 35,35,37,37,39,39]], + 'GRB' : [[54,54, 56,56, 58,58, 60,60, 62,62, 64,64], + [55,55, 57,57, 59,59, 61,61, 63,63, 65,65]], + 'LRSA' : [[40]], + 'LRSB' : [[40]], + 'LWSB' : [[41]], # For B + 'LWSA' : [[66]], # For A + 'LRB1' : [[57,57,59,59,61,61,63,65], + [58,58,60,60,62,62,64,64]], + 'LRA1' : [[67,71,73,75,77,79,81,85,87,89,91,93], + [68,72,74,76,78,80,82,86,88,90,92,94]], + 'LCC' : [[95, 95]], + } + syncCode = [SWaitCnt(dscnt=1, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=10, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SWaitCnt(dscnt=8, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SWaitCnt(dscnt=6, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SWaitCnt(dscnt=2, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=-1, vlcnt=9, vscnt=-1, comment="Wait for LRB0 to complete"), + SBarrier(comment=""), + SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"),] + + else: + return False, None + + numMfma = 96 + opt1 = ScheduleInfo(2, numMfma, optSchedule, syncCode) + return True, opt1 + + def hasCustomSchedule(kernel): if not kernel["UseCustomMainLoopSchedule"]: @@ -274,249 +521,11 @@ def hasCustomSchedule(kernel): is192x256x64DTL = [MT0, MT1, DU, PGR, PLR, DTL] == [192, 256, 64, 2, 1, True] is256x256x128DTL = [MT0, MT1, DU, PGR, PLR, DTL] == [256, 256, 128, 2, 0, True] - - transA = kernel["ProblemType"]["TransposeA"] - transB = kernel["ProblemType"]["TransposeB"] - - isNN = transA == False and transB == False - isNT = transA == False and transB == True - isTT = transA == True and transB == True - isTN = transA == True and transB == False - - # Custom main loop scheduling for 256x256x64 16bit if is256x256x64DTL and is16bit and not isMixed and ([GRVWA, GRVWB, LRVW] == [8,8,8]) and MI == [16,16,32,1] and MIWG == [2,2]: - - kernel["MfmaInitCVgprs"] = True - - optSchedule = dict() - syncCode = [] - - if isTN and TLDS == 1: - optSchedule = { - 'SYNC' : [[19,20, 50,51, 67,68, 104, 105]], - 'GRIncA' : [[0,1,2,3,4,5,6,7,8]], - 'GRIncB' : [[9,10,11,12,13,14,15,16,17]], - 'LRA0' : [[0,2,4,6,8,10,12,14], - [1,3,5,7,9,11,13,15]], - 'LRB0' : [[24,27,30,33,36,38,40,42], - [22,25,28,31,34,37,39,41]], - 'GRA' : [[21,22, 23,25, 26,28, 29,31, 32,34, 35,52, 53,55, 56,58], - [21,23, 24,26, 27,29, 30,32, 33,35, 36,53, 54,56, 57,59]], - 'GRB' : [[59,61, 62,64, 65,85, 86,87, 88,89, 94,96, 98,100, 102,124], - [60,62, 63,65, 66,84, 85,86, 87,88, 93,95, 97,99, 103,123]], - 'LRA1' : [[69,71,73,75,77,79,81,83], - [70,72,74,76,78,80,82,90]], - 'LRB1' : [[106,108,110,112,114,116,118,120], - [107,109,111,113,115,117,119,121]], - 'LRSA' : [[16]], - 'LRSB' : [[83]], - 'LWSA' : [[125]], - 'LWSB' : [[125]], - 'LCC' : [[126, 126]], - } - syncCode = [SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=(2 + 8 + 8), vscnt=-1, comment="Wait for previous GRA to completely"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=15, vscnt=-1, comment="Wait for previous GRA to completely"), - SBarrier(comment="")] - elif isNT and not useLDSTr and TLDS == 0: - kernel["UsePLRPack"] = True - - optSchedule = { - 'SYNC' : [[12,13, 36,44, 56,59, 66,68, 73,92]], - 'GRIncA' : [[0,1,2,3,4,5,6,7,8]], - 'GRIncB' : [[28,29,30,31,32,33,34,35,36]], - 'LRA0' : [[0,0,2,2,4,4,6,6], - [1,1,3,3,5,5,7,7]], - 'LRB0' : [[8,8,10,10,15,15,18,18], - [9,9,11,11,14,14,17,17]], - 'GRA' : [[14,14, 17,17, 20,20, 23,23, 26,26, 45,45, 48,48, 51,51], - [15,15, 18,18, 21,21, 24,24, 27,27, 46,46, 49,49, 52,52]], - 'GRB' : [[54,54, 57,57, 87,87,90,90,93,93,96,96,99,99, 123,123], - [55,55, 58,58, 88,88,91,91,94,94,97,97,100,100, 124,124]], - 'LRA1' : [[60,60,62,62,64,64,66,66], - [61,61,63,63,65,65,67,67]], - 'LRB1' : [[69,69,71,71,73,73,75,75], - [70,70,72,72,74,74,76,76]], - 'LRSA' : [[59]], - 'LRSB' : [[59]], - 'LWSA' : [[125]], - 'LWSB' : [[125]], - 'LCC' : [[126, 126]], - 'PackA0' : [[16,16, 19,19, 21,21, 22,22, 24,24, 25,25, 27,27, 28,28, 29,29, 30,30, 31,31, 32,32, 33,33, 34,34, 35,35, 36,36], - [16,16, 19,19, 20,20, 22,22, 23,23, 25,25, 26,26, 28,28, 29,29, 30,30, 31,31, 32,32, 33,33, 34,34, 35,35, 36,36]], - 'PackB0' : [[37,37, 38,38, 39,39, 40,40, 41,41, 42,42, 43,43, 46,46, 47,47, 49,49, 50,50, 52,52, 53,53, 55,55, 56,56, 58,58], - [37,37, 38,38, 39,39, 40,40, 41,41, 42,42, 43,43, 45,45, 47,47, 48,48, 50,50, 51,51, 53,53, 54,54, 56,56, 57,57]], - 'PackA1' : [[74,74, 76,76, 77,77, 78,78, 79,79, 80,80, 81,81, 82,82, 83,83, 84,84, 85,85, 86,86, 88,88, 89,89, 91,91, 92,92], - [75,75, 77,77, 78,78, 79,79, 80,80, 81,81, 82,82, 83,83, 84,84, 85,85, 86,86, 87,87, 89,89, 90,90, 92,92, 93,93]], - 'PackB1' : [[94,94, 95,95, 97,97, 98,98, 100,100, 101,101, 102,102, 103,103, 104,104, 105,105, 106,106, 107,107, 108,108, 109,109, 110,110, 111,111], - [95,95, 96,96, 98,98, 99,99, 101,101, 102,102, 103,103, 104,104, 105,105, 106,106, 107,107, 108,108, 109,109, 110,110, 111,111, 112,112]], - } - syncCode = [SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=17, vscnt=-1, comment="Wait for GRA to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=9, vscnt=-1, comment="Wait for GRB to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA1 to complete"), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB1 to complete")] - elif (isNN or isTT) and not useLDSTr and TLDS == 1: - kernel["UsePLRPack"] = True - - optSchedule = { - 'SYNC' : [[8, 12,13, 36,44, 56,59, 66,68, 73]], - 'GRIncA' : [[0,1,2,3,4,5,6,7,8]], - 'GRIncB' : [[28,29,30,31,32,33,34,35,36]], - 'LRA0' : [[0,0,2,2,4,4,6,6], - [1,1,3,3,5,5,7,7]], - 'LRB0' : [[9,11, 15,18,21,24,27,30], - [10,12, 14,17,20,23,26,29]], - 'GRA' : [[14,14, 17,17, 20,20, 23,23, 26,26, 45,45, 48,48, 51,51], - [15,15, 18,18, 21,21, 24,24, 27,27, 46,46, 49,49, 52,52]], - 'GRB' : [[54,54, 57,57, 87,87,90,90,93,93,96,96,99,99, 123,123], - [55,55, 58,58, 88,88,91,91,94,94,97,97,100,100, 124,124]], - 'LRA1' : [[60,60,62,62,64,64,66,66], - [61,61,63,63,65,65,67,67]], - 'LRB1' : [[68,70,72,74,76,78,80,82], - [69,71,73,75,77,79,81,83]], - 'LRSA' : [[59]], - 'LRSB' : [[59]], - 'LWSA' : [[125]], - 'LWSB' : [[125]], - 'LCC' : [[126, 126]], - 'PackA0' : [[8,8, 16,16, 19,19, 22,22, 25,25, 28,28, 29,29, 31,31, 32,32, 33,33, 34,34, 35,35, 36,36, 37,37, 38,38, 39,39]], - 'PackA1' : [[75,75, 77,77, 79,79, 81,81, 83,83, 84,84, 85,85, 86,86, 88,88, 89,89, 91,91, 92,92, 94,94, 95,95, 97,97, 98,98], - [74,74, 76,76, 78,78, 80,80, 82,82, 84,84, 85,85, 86,86, 87,87, 89,89, 90,90, 92,92, 93,93, 95,95, 96,96, 98,98]], - } - syncCode = [SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 first half to complete"), - SWaitCnt(dscnt=1, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=17, vscnt=-1, comment="Wait for GRA to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=9, vscnt=-1, comment="Wait for GRB to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA1 to complete")] - if isTT: - kernel["SwapGlobalReadOrder"] = True - - optSchedule['GRIncA'], optSchedule['GRIncB'] = optSchedule['GRIncB'], optSchedule['GRIncA'] - optSchedule['LRA0'], optSchedule['LRB0'] = optSchedule['LRB0'], optSchedule['LRA0'] - optSchedule['LRA1'], optSchedule['LRB1'] = optSchedule['LRB1'], optSchedule['LRA1'] - optSchedule['PackB0'] = optSchedule['PackA0'] - optSchedule['PackB1'] = optSchedule['PackA1'] - del optSchedule['PackA0'], optSchedule['PackA1'] - else: - return False, None - - - numMfma = 128 - opt1 = ScheduleInfo(2, numMfma, optSchedule, syncCode) - return True, opt1 + return _get_schedule_256x256x64_16bit(kernel, useLDSTr, TLDS) elif is256x256x128DTL and is8bit and not isMixed and ([GRVWA, GRVWB, LRVW] == [16, 16, 16]) and MI == [16,16,128,1] and MIWG == [2,2]: - - kernel["MfmaInitCVgprs"] = True - - optSchedule = dict() - syncCode = [] - - plr = 3 if kernel["ForceUnrollSubIter"] else 1 - - if isTN and TLDS == 1: - optSchedule = { - 'SYNC' : [[6,7, 20,21, 46,47, 61]], - 'GRIncA' : [[0,1,2,3,4,4,4,4,4]], - 'GRIncB' : [[5,5,5,5,5,6,6,6,6]], - 'LRA0' : [[0,0, 1,1, 2,2, 3,3]], - 'GRA' : [[8,8,9,9,10,10,11,11,12,12, 23,23,24,24,25,25]], - 'LRB0' : [[13,13,14,14,15,15,16,16]], - 'LRA%u'%plr : [[48,48,49,49,50,50,51,51]], - 'LRB%u'%plr : [[52,52,54,54,55,55,56,56]], - 'GRB' : [[26,26,27,27, 39,39,40,40,41,41,42,42,43,43, 53,53]], - 'LCC' : [[60, 60]], - 'LRSA' : [[17]], - 'LRSB' : [[17]], - 'LWSA' : [[57]], - 'LWSB' : [[57]], - } - syncCode = [SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0/LRB0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0/LRB0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=15, vscnt=-1, comment="Wait for GRA to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for PLR to complete")] - else: - return False, None - - numMfma = 64 - # B0A0, B0A1, B1A0, B1A1 - mfmaReorder = [] - if not kernel["ForceUnrollSubIter"]: - mfmaReorder = [0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27, - 4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31, - 32,33,34,35, 40,41,42,43, 48,49,50,51, 56,57,58,59, - 36,37,38,39, 44,45,46,47, 52,53,54,55, 60,61,62,63] - opt1 = ScheduleInfo(1, numMfma, optSchedule, syncCode, mfmaReorder) - return True, opt1 + return _get_schedule_256x256x128_8bit(kernel, useLDSTr, TLDS) elif is192x256x64DTL and is16bit and not isMixed and ([GRVWA, GRVWB, LRVW] == [8, 8, 8]) and MI == [16,16,32,1] and MIWG == [2,2]: - - kernel["MfmaInitCVgprs"] = True - - optSchedule = dict() - syncCode = [] - if isNN and useLDSTr and TLDS==1: - # TODO: This schedule can be improved when BC are resolved for MT192 - # Note: A/B Global read orders are swapped - # i.e. GRA contains GR for B - kernel["SwapGlobalReadOrder"] = True - optSchedule = { - 'SYNC' : [[12,13, 47,48,49,50,51, 52,53, 56,56, 94]], - 'GRIncB' : [[0,1,2,3,4,5,6,7,8]], - 'GRIncA' : [[9,10,11,12,13,14,15,16,17]], - 'LRB0' : [[0,0,1,1,2,2,6,8], - [3,3,4,4,5,5,7,9]], - # These local reads have BC - 'LRA0' : [[10, 15,17,19,21,23, 25,27,29,33,37,39], - [11, 14,16,18,20,22, 24,26,28,32,36,38]], - 'GRA' : [[14,14, 16,16, 18,18, 20,20, 22,22, 34,34,36,36,38,38], - [15,15, 17,17, 19,19, 21,21, 23,23, 35,35,37,37,39,39]], - 'GRB' : [[54,54, 56,56, 58,58, 60,60, 62,62, 64,64], - [55,55, 57,57, 59,59, 61,61, 63,63, 65,65]], - 'LRSA' : [[40]], - 'LRSB' : [[40]], - 'LWSB' : [[41]], # For B - 'LWSA' : [[66]], # For A - 'LRB1' : [[57,57,59,59,61,61,63,65], - [58,58,60,60,62,62,64,64]], - 'LRA1' : [[67,71,73,75,77,79,81,85,87,89,91,93], - [68,72,74,76,78,80,82,86,88,90,92,94]], - 'LCC' : [[95, 95]], - } - syncCode = [SWaitCnt(dscnt=1, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=10, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SWaitCnt(dscnt=8, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SWaitCnt(dscnt=6, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SWaitCnt(dscnt=2, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRA0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=-1, vlcnt=9, vscnt=-1, comment="Wait for LRB0 to complete"), - SBarrier(comment=""), - SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"),] - - else: - return False, None - - numMfma = 96 - opt1 = ScheduleInfo(2, numMfma, optSchedule, syncCode) - return True, opt1 + return _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS) return False, None From 7641aa746d7832a335266bef040ae137d6f23a7d Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Wed, 12 Nov 2025 05:51:21 +0000 Subject: [PATCH 2/7] add 192x256x64TN --- .../Tensile/Components/CustomSchedule.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index ed7d59e2efc..c6c9084e22c 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -487,6 +487,42 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): SBarrier(comment=""), SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"),] + elif isTN and not useLDSTr and TLDS == 1: + #index and code pair + syncTable = [8, SBarrier(comment="for GRA start"), + 23, SWaitCnt(dscnt=17, vlcnt=-1, vscnt=-1, comment="for LRB1-4"), + 29, SWaitCnt(dscnt=16, vlcnt=-1, vscnt=-1, comment="for LRB1-5"), + 35, SWaitCnt(dscnt=15, vlcnt=-1, vscnt=-1, comment="for LRB1-6"), + 41, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="for LRB1-7"), + 47, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="for LRA/B1"), + 46, SBarrier(comment="for GRB start"),] + optSchedule = { + 'SYNC' : [syncTable[::2]], + 'GRIncA': [[0,1,2,3,4,5,6,7,8]], + 'GRIncB': [[9,10,11,12,13,14,15,16,17]], + + 'LRA0' : [[0, 2, 3, 4, 5, 6]], + 'LRB0' : [[7, 9, 11, 13, 15, 17, 19, 21], + [8, 10, 12, 14, 16, 18, 20, 22]], + 'GRA' : [[8,8, 10,10, 12,12, 14,14, 26,26, 31,31], + [9,9, 11,11, 13,13, 15,15, 27,27, 32,32]], + + 'GRB' : [[46,46, 50,50, 54,54, 58,58, 62,62, 66,66, 70,70, 77,77], + [47,47, 51,51, 55,55, 59,59, 63,63, 67,67, 71,71, 78,78]], + 'LRA1' : [[48, 52, 56, 58, 60, 64], + [49, 53, 57, 59, 61, 65]], + # 0 1 2 3 4 5 6 7 + 'LRB1' : [[78, 80, 82, 84, 86, 90, 92, 94], + [79, 81, 83, 85, 87, 91, 93, 95]], + + 'LRSA' : [[22]], + 'LRSB' : [[23]], + + 'LWSA' : [[20]], + 'LWSB' : [[78]], + 'LCC' : [[95, 95]], + } + syncCode = syncTable[1::2] else: return False, None From 8f47b794fe3f42815335f66d5f5d107830f9858f Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Wed, 12 Nov 2025 05:52:51 +0000 Subject: [PATCH 3/7] typo --- .../hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index c6c9084e22c..a761d940aba 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -487,7 +487,7 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): SBarrier(comment=""), SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"),] - elif isTN and not useLDSTr and TLDS == 1: + elif isTN(kernel) and not useLDSTr and TLDS == 1: #index and code pair syncTable = [8, SBarrier(comment="for GRA start"), 23, SWaitCnt(dscnt=17, vlcnt=-1, vscnt=-1, comment="for LRB1-4"), From 1b39b510c93c55a8814cfaff3e4ef8100b9a66b1 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Mon, 10 Nov 2025 10:49:56 +0000 Subject: [PATCH 4/7] add test --- ...k_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 246 ++++++++++++++++++ .../gfx950/custom_mainloop_scheduling.yaml | 34 ++- 2 files changed, 279 insertions(+), 1 deletion(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index 5b2ca51968f..0ceb68faa3b 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -301206,6 +301206,250 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1iMO6yjTG4yuZPDkcMXrcObwuUOXV_ipLalV5INSQqYE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123776 + LdsInitCVgprs: false + LdsNumBytes: 123776 + LdsNumElementsAlignedA: 24960 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24960 + LdsOffsetB_Blk: 90496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24960 + LdsOffsetMetadata_Blk: 90496 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: 6 + NumTotalPackedLoadsB: 8 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1289 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false - [2, 3, 0, 1] - - - [16, 368640, 1, 224] - [0, 0.0] @@ -303807,6 +304051,8 @@ - [1287, 0.0] - - [16032, 5, 1, 16384] - [1288, 0.0] + - - [3072, 4096, 1, 8192] + - [1289, 0.0] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index 8d0f8886001..d28cc62bdf9 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -386,7 +386,39 @@ BenchmarkProblems: - Range: [[613], [612], [1], [1, 1, 64]] - Exact: [8192, 4096, 1, 2048] - BiasTypeArgs: ['b'] - + - # BenchmarkProblemSizeGroup - Standard - All problem + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16,32, 1, 1, 6, 8, 2,2 ] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - DepthU: [64] + - ScheduleIterAlg: [3] + - ExpandPointerSwap: [0] + - TransposeLDS: [1] #0,1 + - LocalReadVectorWidth: [8] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - DirectToLds: [1] + - StreamK: [3] + - LdsPadA: [8] #[-1] + - LdsPadB: [8] #[-1] + - StaggerU: [0] + - WorkGroupMapping: [16] + - WorkGroupMappingXCC: [2] + - 1LDSBuffer: [0] + - NonTemporalD: [4] + - SourceSwap: [1] + - UseSgprForGRO: [0] + - UseCustomMainLoopSchedule: [0, 1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [3072, 4096, 1, 8192] + - BiasTypeArgs: ['b'] ######################################## # HHS TN - standard ######################################## From 485c703df1c3c2c929494076f66935ef0affca25 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Wed, 12 Nov 2025 06:51:41 +0000 Subject: [PATCH 5/7] more test --- .../Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index d28cc62bdf9..524d495ec4c 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -417,6 +417,10 @@ BenchmarkProblems: BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: + - Range: [[192], [256], [1], [64, 64, 256]] + - Range: [[192], [256], [1], [1, 1, 64]] + - Range: [[192], [256], [1], [32, 64, 256]] + - Range: [[6144], [8192], [1], [64, 64, 256]] - Exact: [3072, 4096, 1, 8192] - BiasTypeArgs: ['b'] ######################################## From 5a943a53d98c69e1f25907eb8a072a030967d941 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Fri, 14 Nov 2025 01:48:07 +0000 Subject: [PATCH 6/7] typo --- .../gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index 45b71a16334..9c4bd13b8f6 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -308981,7 +308981,6 @@ - [1306, 0.0] - - [3072, 4096, 1, 8192] - [1308, 0.0] - - null - null - DeviceEfficiency From 461fb4f39c9833c20e108e70de6a619f67c6fe84 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Mon, 17 Nov 2025 03:35:50 +0000 Subject: [PATCH 7/7] fix random fail --- .../Tensile/Components/CustomSchedule.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index 1cc3db43c28..aaf95fd3d52 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -347,13 +347,17 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): nglshift = nllshift = 14 # vmcnt shift for ngl and nll elif isTN(kernel) and not useLDSTr and TLDS == 1: #index and code pair - syncTable = [8, SBarrier(comment="for GRA start"), - 23, SWaitCnt(dscnt=17, vlcnt=-1, vscnt=-1, comment="for LRB1-4"), - 29, SWaitCnt(dscnt=16, vlcnt=-1, vscnt=-1, comment="for LRB1-5"), - 35, SWaitCnt(dscnt=15, vlcnt=-1, vscnt=-1, comment="for LRB1-6"), - 41, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="for LRB1-7"), - 47, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="for LRA/B1"), - 46, SBarrier(comment="for GRB start"),] + syncTable = [-1, SWaitCnt(dscnt=7, vlcnt=-1, vscnt=-1, comment="for LRB1-0"), + 6, SWaitCnt(dscnt=6+5, vlcnt=-1, vscnt=-1, comment="for LRB1-1"), + 8, SBarrier(comment="for GRA start"), + 11, SWaitCnt(dscnt=5+8, vlcnt=-1, vscnt=-1, comment="for LRB1-2"), + 17, SWaitCnt(dscnt=4+11, vlcnt=-1, vscnt=-1, comment="for LRB1-3"), + 23, SWaitCnt(dscnt=15, vlcnt=-1, vscnt=-1, comment="for LRB1-4:6"), + 41, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="for LRB1-7"), + 46, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRA1"), + 48, SBarrier(comment="for LRA1 start"), + 78, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRB1"), + 78, SBarrier(comment="for LRB1 start"),] optSchedule = { 'SYNC' : [syncTable[::2]], 'GRIncA': [[0,1,2,3,4,5,6,7,8]], @@ -365,11 +369,11 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): 'GRA' : [[8,8, 10,10, 12,12, 14,14, 26,26, 31,31], [9,9, 11,11, 13,13, 15,15, 27,27, 32,32]], - 'GRB' : [[46,46, 50,50, 54,54, 58,58, 62,62, 66,66, 70,70, 77,77], - [47,47, 51,51, 55,55, 59,59, 63,63, 67,67, 71,71, 78,78]], + 'GRB' : [[46,46, 50,50, 54,54, 58,58, 62,62, 66,66, 70,70, 76,76], + [47,47, 51,51, 55,55, 59,59, 63,63, 67,67, 71,71, 77,77]], 'LRA1' : [[48, 52, 56, 58, 60, 64], [49, 53, 57, 59, 61, 65]], - # 0 1 2 3 4 5 6 7 + # 0 1 2 3 4 5 6 7 'LRB1' : [[78, 80, 82, 84, 86, 90, 92, 94], [79, 81, 83, 85, 87, 91, 93, 95]], @@ -381,6 +385,7 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): 'LCC' : [[95, 95]], } syncCode = syncTable[1::2] + nglshift = nllshift = 14 # vmcnt shift for ngl and nll else: return False, None